From 362e657ab5f4710cf6bd472ccd14f65c9fa354e3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 11:45:57 +0100 Subject: [PATCH 01/59] handling different format of surveyed windows --- etl/customers/aiha/xml_extraction.py | 60 ++++++++++++++++++++++++++ etl/xml_survey_extraction/XmlParser.py | 34 ++++++++++++++- 2 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 etl/customers/aiha/xml_extraction.py diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py new file mode 100644 index 00000000..d235be78 --- /dev/null +++ b/etl/customers/aiha/xml_extraction.py @@ -0,0 +1,60 @@ +import os +from io import BytesIO +from etl.xml_survey_extraction.XmlParser import XmlParser + +SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" + + +def main(): + """ + This script handles the extraction of data from the XML files in the survey folders. + :return: + """ + # Step 1: List all subfolders inside SURVEY_FOLDER_PATH. + subfolders = [f.path for f in os.scandir(SURVEY_FOLDER_PATH) if f.is_dir()] + + # Step 2: Loop through each subfolder and find the XML files. + extracted_surveys = [] + for subfolder in subfolders: + print(f"Searching in subfolder: {subfolder}") + + # Find all XML files in the current subfolder. + xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')] + + if not xml_files: + raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}") + + # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key. + for xml_file in xml_files: + xml_path = os.path.join(subfolder, xml_file) + print(f"Processing XML file: {xml_path}") + + # Read in the XML and parse it using the XmlParser class. + with open(xml_path, 'rb') as file: + xml_data_io = BytesIO(file.read()) + uprn = None # Set the UPRN if available. 
+ + # Create an XmlParser instance + xml_parser = XmlParser( + file=xml_data_io, + filekey=xml_path, + surveyor_company="", + uprn=uprn, + ) + + # Run the parser to extract the data + xml_parser.run() + + # Store the extracted data for further processing + extracted_surveys.append({ + "epc": xml_parser.epc, + "additional_data": xml_parser.additional_data, + "subfolder": subfolder + }) + + print(f"Extracted {len(extracted_surveys)} surveys.") + # Process the extracted_surveys as needed, for example, save to a database or write to a file. + + +if __name__ == "__main__": + main() diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index ffe191a4..ed3d65d2 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -769,8 +769,6 @@ class XmlParser: :return: """ - sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window") - glazing_type_lookup = { "3": "double glazing, unknown install date", "5": "Single glazing", @@ -787,6 +785,38 @@ class XmlParser: "8": "North West" } + sap_windows = self.xml.getElementsByTagName("SAP-Windows") + + if not sap_windows: + # We look for Multi-Glazed-Proportion + multiple_glazing_type = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "Multiple-Glazing-Type" + )[0].firstChild.nodeValue + + pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "PVC-Window-Frames" + )[0].firstChild.nodeValue + + multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "Multiple-Glazed-Proportion" + )[0].firstChild.nodeValue + + self.windows = [ + { + "window_location": None, + "window_area": None, + "window_type": None, + "glazing_type": glazing_type_lookup[multiple_glazing_type], + "pvc_frame": pvc_frame, + "glazing_gap": None, + "orientation": None, + "multple_glazed_proportion": multple_glazed_proportion + } + ] + 
return + + sap_windows = sap_windows[0].getElementsByTagName("SAP-Window") + self.windows = [ self._parse_windows_content( window=window, From 323364e0dff03fe5a02c575cce043568eae783e4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 11:51:00 +0100 Subject: [PATCH 02/59] added additional built form to built form map in XmlParser --- etl/xml_survey_extraction/XmlParser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index ed3d65d2..a0ed02e1 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -107,6 +107,7 @@ class XmlParser: BUILT_FORM_MAP = { "1": "Detached", + "2": "Semi-Detached", "3": "End-Terrace", "4": "Mid-Terrace", } @@ -803,7 +804,7 @@ class XmlParser: self.windows = [ { - "window_location": None, + "window_location": "0", "window_area": None, "window_type": None, "glazing_type": glazing_type_lookup[multiple_glazing_type], From 8f8e85c1e1d1fa202f5ec5c4747a92fcde36b292 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 12:01:05 +0100 Subject: [PATCH 03/59] debuggin xml extraction --- etl/xml_survey_extraction/XmlParser.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index a0ed02e1..a4061b3a 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -113,6 +113,7 @@ class XmlParser: } GLAZED_AREA_MAP = { + "2": "More than Typical", "4": "Much More Than Typical" } @@ -121,7 +122,8 @@ class XmlParser: } TRANSACTION_TYPE_MAP = { - "13": "ECO assessment" + "13": "ECO assessment", + "14": "Stock condition survey", } TENURE_MAP = { @@ -401,8 +403,13 @@ class XmlParser: ] wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors]) - window_areas = 
sum([float(w["window_area"]) for w in main_dwelling_windows]) - return wall_areas - window_areas + window_areas = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None] + if not window_areas: + # We discount 10% of the wall area + insulation_wall_area = wall_areas * 0.9 + else: + insulation_wall_area = wall_areas - window_areas + return insulation_wall_area def extract_additional_data(self): @@ -416,7 +423,8 @@ class XmlParser: main_dwelling_windows = [w for w in self.windows if w["window_location"] == "0"] number_of_windows = len(main_dwelling_windows) - windows_area = sum([float(w["window_area"]) for w in main_dwelling_windows]) + windows_area = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None] + windows_area = sum(windows_area) if windows_area else None boolean_lookup = { "true": True, @@ -462,7 +470,7 @@ class XmlParser: "cylinder_thermostat": cylinder_thermostat, "main_dwelling_ground_floor_area": float(main_dwelling_ground_floor_area), "number_of_windows": int(number_of_windows), - "windows_area": float(windows_area), + "windows_area": float(windows_area) if windows_area is not None else windows_area, } def get_node_value(self, tag_name): From 60490cd4faf100fe3f66754a23effc8211b1793c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 14:23:20 +0100 Subject: [PATCH 04/59] xml extraction --- etl/xml_survey_extraction/XmlParser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index a4061b3a..a2246629 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -804,7 +804,9 @@ class XmlParser: pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( "PVC-Window-Frames" - )[0].firstChild.nodeValue + ) + + pvc_frame = pvc_frame[0].firstChild.nodeValue if pvc_frame else None 
multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( "Multiple-Glazed-Proportion" From 9d4a93ca3efa43a66c5d3f13843f4f62386e978c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 15:18:42 +0100 Subject: [PATCH 05/59] debugging xml extraction --- etl/xml_survey_extraction/XmlParser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index a2246629..f8f2285d 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -9,7 +9,8 @@ from etl.xml_survey_extraction.pcdb import heating_data PROPERTY_TYPE_LOOKUP = { "0": "House", "House": "House", - "2": "Flat" + "2": "Flat", + "3": "Maisonette", } @@ -122,6 +123,7 @@ class XmlParser: } TRANSACTION_TYPE_MAP = { + "5": "Rented (social)", "13": "ECO assessment", "14": "Stock condition survey", } @@ -134,7 +136,8 @@ class XmlParser: TARIFF_MAP = { "1": "Dual", - "2": "Single" + "2": "Single", + "3": "Unknown" } def __init__(self, file, filekey, surveyor_company, uprn=None): @@ -408,7 +411,7 @@ class XmlParser: # We discount 10% of the wall area insulation_wall_area = wall_areas * 0.9 else: - insulation_wall_area = wall_areas - window_areas + insulation_wall_area = wall_areas - sum(window_areas) return insulation_wall_area def extract_additional_data(self): @@ -779,6 +782,7 @@ class XmlParser: """ glazing_type_lookup = { + "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", } From bfded2aaf985b65a5551c7f0f55706d54f36a5f7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 23 Oct 2024 15:25:11 +0100 Subject: [PATCH 06/59] expanding xml extraction --- etl/xml_survey_extraction/XmlParser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index 
f8f2285d..fa70b6b7 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -439,6 +439,7 @@ class XmlParser: cylinder_insulation_type = { None: "", "1": "Foam", + "2": "Jacket" } cylinder_insulation_thickness = int( @@ -782,6 +783,7 @@ class XmlParser: """ glazing_type_lookup = { + "ND": "Single glazing", "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", From ce9b3e5e2014fdeaba52ecf977618a5b16898a29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 18:13:28 +0100 Subject: [PATCH 07/59] creating aiha output --- etl/customers/aiha/xml_extraction.py | 452 ++++++++++++++++++++++++++- 1 file changed, 448 insertions(+), 4 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index d235be78..416065e7 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -1,5 +1,8 @@ import os from io import BytesIO + +import pandas as pd + from etl.xml_survey_extraction.XmlParser import XmlParser SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" @@ -22,7 +25,8 @@ def main(): xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')] if not xml_files: - raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}") + print(f"No XML files found in subfolder: {subfolder}") + continue # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key. 
for xml_file in xml_files: @@ -44,16 +48,456 @@ def main(): # Run the parser to extract the data xml_parser.run() + if not xml_parser.epc: + # If we don't have a lig xml + continue # Store the extracted data for further processing extracted_surveys.append({ - "epc": xml_parser.epc, - "additional_data": xml_parser.additional_data, - "subfolder": subfolder + "survey_key": subfolder.split("/")[-1], + **xml_parser.epc, + **xml_parser.additional_data }) print(f"Extracted {len(extracted_surveys)} surveys.") # Process the extracted_surveys as needed, for example, save to a database or write to a file. + extracted_surveys = pd.DataFrame(extracted_surveys) + + # THis is the data we need for the AIHA project + measures_data = extracted_surveys[ + ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating"] + ] + measures_data = measures_data.sort_values("survey_key", ascending=True) + + # Note: + # The properties will still have "Very poor" ratings for their hot water + + # TODO + # - AIH001-03 has a basement and so we should discount this area from the ground floor + # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft + # - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the + # best option for this property due to it being extrememly large and the walls being uninsulated. It might not + # be performant enough in the winter, when COP will be more like 1.5. + # - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are + # in the property? Does it make sense to have such a large solar PV system (5.6kWp)? + # - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C + # - Generally, should we consider insulated doors? 
+ # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same + # buulding + # - AIH001-09 - The extension is 1900-1929 but has a cavity wall + # - AIH001-09 - Is it not possible to install a loft hatch? + # - AIH001-09 - Why is there assumed secondary heating? + # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? + # - AIH001-11 - The layout of this unit is confusing, is there roof access? + # - AIH001-12 - Why was there not access to the cylinder? + # + + recommended_measures = [ + { + "survey_key": "AIH001-01", + "starting_sap": 69, + "recommended_measures": [], + "notes": "Is EPC C" + }, + { + "survey_key": "AIH001-03", + "starting_sap": 43, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 44, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with various configurations", + "config": [ + { + "size": "4kWp", + "orientation": "East", + "elavation": 30, + "overshading": "Modest", + }, + { + "size": "1.6kWp", + "orientation": "Horizontal", + "elavation": "Horizontal", + "overshading": "Modest", + } + ], + "sap_points": 7, + "ending_sap": 53 + }, + { + "measure": "Loft Insulation", + "description": "300mm of loft insulation", + "sap_points": 8, + "ending_sap": 61 + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 64 + } + ], + "notes": "There was no access to the loft for this property and so a loft hatch would need to be " + "installed..." 
+ }, + { + "survey_key": "AIH001-04", + "starting_sap": 48, + "recommended_measures": [ + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "sap_points": 4, + "ending_sap": 52 + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 55 + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 4kW capacity, south-facing", + "config": [ + { + "size": "4kW", + "orientation": "South", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 12, + "ending_sap": 67 + } + ], + "notes": "" + }, + { + "survey_key": "AIH001-05", + "starting_sap": 54, + "recommended_measures": [ + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "sap_points": 5, + "ending_sap": 59, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 61, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 4kW capacity, horizontal orientation", + "config": [ + { + "size": "4kW", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 9, + "ending_sap": 70 + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 73 + } + ], + "notes": "" + }, + { + "survey_key": "AIH001-06", + "starting_sap": 62, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 64, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 2kW capacity, south-facing", + "config": [ + { + "size": "2kW", + "orientation": "South", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 6, + "ending_sap": 70 + } + ] + }, + { + "survey_key": "AIH001-07", + "starting_sap": 74, + "recommended_measures": [], + "notes": "Is EPC C" + }, + { + "survey_key": "AIH001-08", + 
"starting_sap": 56, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm of loft insulation", + "sap_points": 2, + "ending_sap": 58, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 4, + "ending_sap": 62, + }, + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "sap_points": 5, + "ending_sap": 69, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 69, + } + ] + }, + { + "survey_key": "AIH001-09", + "starting_sap": 44, + "recommended_measures": [ + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "sap_points": 8, + "ending_sap": 52, + }, + { + "measure": "Cavity Wall Insulation", + "description": "Cavity wall insulation for extensions", + "sap_points": 1, + "ending_sap": 53, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 53, + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 3, + "ending_sap": 56, + } + ] + }, + { + "survey_key": "AIH001-11", + "starting_sap": 59, + "recommended_measures": [ + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 4, + "ending_sap": 63, + }, + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "sap_points": 5, + "ending_sap": 68, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 69, + } + ] + }, + { + "survey_key": "AIH001-12", + "starting_sap": 46, + "recommended_measures": [ + { + "measure": "Double Glazing", + "description": "Installation of double glazing", + "sap_points": 2, + "ending_sap": 48, + }, + { + "measure": "Draught Proofing", + "description": "Draught proofing improvements", + "sap_points": 1, + 
"ending_sap": 49, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 3.2kW capacity, east-facing", + "config": [ + { + "size": "3.2W", + "orientation": "East", + "elavation": 30, + "overshading": "Little or none", + } + ], + "sap_points": 9, + "ending_sap": 58 + }, + { + "measure": "Air Source Heat Pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump", + "sap_points": 15, + "ending_sap": 73 + }, + { + "measure": "Tariff Review", + "description": "Switch to 24-hour tariff", + "sap_points": 15, + "ending_sap": 88 + } + ] + }, + { + "survey_key": "AIH001-13", + "starting_sap": 53, + "recommended_measures": [ + { + "measure": "Roof Insulation", + "description": "100mm+ insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "sap_points": 6, + "ending_sap": 59, + }, + { + "measure": "Flat Roof Insulation", + "description": "Flat roof insulation", + "sap_points": 2, + "ending_sap": 61, + }, + { + "measure": "Cavity Wall Insulation", + "description": "Cavity wall insulation", + "sap_points": 6, + "ending_sap": 67, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 67, + }, + { + "measure": "TTZC", + "description": "Thermostatic Time Zone Control", + "sap_points": 2, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 4kW capacity, flat roof installation", + "config": [ + { + "size": "4kW", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 9, + "ending_sap": 78 + } + ] + }, + { + "survey_key": "AIH001-14", + "starting_sap": 63, + "recommended_measures": [ + { + "measure": "Cavity Wall Insulation", + "description": "Insulation for cavity walls", + "sap_points": 5, + "ending_sap": 68, + }, + { + "measure": "Ventilation", + "description": "Ventilation improvement", + "sap_points": 0, + "ending_sap": 68, + }, + { + "measure": "Loft Insulation", + "description": 
"Installation of loft insulation", + "sap_points": 1, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "Solar PV system with 10kW capacity", + "sap_points": 10, + "ending_sap": 79, + } + ] + }, + ] + + # Step 1: Normalize the recommended_measures data into a DataFrame. + normalized_measures = [] + + for survey in recommended_measures: + survey_key = survey["survey_key"] + starting_sap = survey["starting_sap"] + for measure in survey.get("recommended_measures", []): + normalized_measures.append({ + "survey_key": survey_key, + "starting_sap": starting_sap, + "measure": measure["measure"], + "description": measure.get("description", "") + }) + + # Convert the normalized list into a DataFrame. + measures_df = pd.DataFrame(normalized_measures) + + # Step 2: Pivot the measures_df to have a column for each measure type, using the description as values. + pivoted_measures = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="description", + aggfunc=lambda x: ' '.join(x), # Concatenate descriptions if there are multiple entries. + fill_value=None + ).reset_index() + + # Step 3: Extract starting SAP for each survey key. + starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] + + # Merge starting SAP back onto pivoted measures. + result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") + + # Step 4: Calculate the ending SAP using the total sap points. + # Note: If you want to use total sap points, you'll need to update the total calculation accordingly. + + # Step 5: Merge the result with the measures_data to get the final DataFrame. 
+ final_measures = measures_data.merge( + result_df, how="left", on="survey_key" + ) if __name__ == "__main__": From 56fb33a64a16261f35f286adffc8268503fac24c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 18:39:55 +0100 Subject: [PATCH 08/59] added placeholder pricing sheet --- etl/customers/aiha/xml_extraction.py | 101 ++++++++++++++++++--------- 1 file changed, 68 insertions(+), 33 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 416065e7..563ed7ca 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -90,7 +90,7 @@ def main(): # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? # - AIH001-11 - The layout of this unit is confusing, is there roof access? # - AIH001-12 - Why was there not access to the cylinder? - # + # - AIH001-12 - Is the need to draught proofing due to the windows? recommended_measures = [ { @@ -111,7 +111,7 @@ def main(): }, { "measure": "Solar PV", - "description": "Solar PV system with various configurations", + "description": "4kWp Solar PV system", "config": [ { "size": "4kWp", @@ -131,13 +131,13 @@ def main(): }, { "measure": "Loft Insulation", - "description": "300mm of loft insulation", + "description": "300mm loft insulation", "sap_points": 8, "ending_sap": 61 }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 64 } @@ -157,16 +157,16 @@ def main(): }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 55 }, { "measure": "Solar PV", - "description": "Solar PV system with 4kW capacity, south-facing", + "description": "4kWp Solar PV system", "config": [ { - "size": "4kW", + "size": "4kWp", "orientation": "South", "elavation": 30, "overshading": "Modest", @@ -196,7 +196,7 @@ 
def main(): }, { "measure": "Solar PV", - "description": "Solar PV system with 4kW capacity, horizontal orientation", + "description": "4kWp Solar PV system", "config": [ { "size": "4kW", @@ -210,7 +210,7 @@ def main(): }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 73 } @@ -229,7 +229,7 @@ def main(): }, { "measure": "Solar PV", - "description": "Solar PV system with 2kW capacity, south-facing", + "description": "2kWp Solar PV system", "config": [ { "size": "2kW", @@ -255,7 +255,7 @@ def main(): "recommended_measures": [ { "measure": "Loft Insulation", - "description": "300mm of loft insulation", + "description": "300mm loft insulation", "sap_points": 2, "ending_sap": 58, }, @@ -273,7 +273,7 @@ def main(): }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 69, } @@ -291,19 +291,19 @@ def main(): }, { "measure": "Cavity Wall Insulation", - "description": "Cavity wall insulation for extensions", + "description": "CWI to rdSAP default standard", "sap_points": 1, "ending_sap": 53, }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 53, }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 56, } @@ -315,7 +315,7 @@ def main(): "recommended_measures": [ { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 4, "ending_sap": 63, }, @@ -345,13 +345,13 @@ def main(): }, { "measure": "Draught Proofing", - "description": "Draught proofing improvements", + "description": "Window draught proofing improvements", "sap_points": 1, "ending_sap": 49, }, { "measure": "Solar PV", - "description": "Solar PV system with 3.2kW capacity, east-facing", + "description": 
"3.2kWp Solar PV system", "config": [ { "size": "3.2W", @@ -383,37 +383,37 @@ def main(): "recommended_measures": [ { "measure": "Roof Insulation", - "description": "100mm+ insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", "sap_points": 6, "ending_sap": 59, }, { "measure": "Flat Roof Insulation", - "description": "Flat roof insulation", + "description": "100mm flat roof insulation", "sap_points": 2, "ending_sap": 61, }, { "measure": "Cavity Wall Insulation", - "description": "Cavity wall insulation", + "description": "CWI to rdSAP default standard", "sap_points": 6, "ending_sap": 67, }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 67, }, { "measure": "TTZC", - "description": "Thermostatic Time Zone Control", + "description": "Smart Thermostat", "sap_points": 2, "ending_sap": 69, }, { "measure": "Solar PV", - "description": "Solar PV system with 4kW capacity, flat roof installation", + "description": "4kWp Solar PV system", "config": [ { "size": "4kW", @@ -433,25 +433,25 @@ def main(): "recommended_measures": [ { "measure": "Cavity Wall Insulation", - "description": "Insulation for cavity walls", + "description": "CWI to rdSAP default standard", "sap_points": 5, "ending_sap": 68, }, { "measure": "Ventilation", - "description": "Ventilation improvement", + "description": "2x DMEV fans", "sap_points": 0, "ending_sap": 68, }, { "measure": "Loft Insulation", - "description": "Installation of loft insulation", + "description": "300mm loft insulation", "sap_points": 1, "ending_sap": 69, }, { "measure": "Solar PV", - "description": "Solar PV system with 10kW capacity", + "description": "3.2kWp Solar PV system", "sap_points": 10, "ending_sap": 79, } @@ -459,6 +459,33 @@ def main(): }, ] + descs = [] + for r in recommended_measures: + for m in r["recommended_measures"]: + 
descs.append(m["description"]) + descs = list(set(descs)) + + # TODO - need to add scaffolding + pricing_data = [ + {'item': '80mm cylinder insulation', 'unit_price': None, 'unit': 'unit'}, + {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, + {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, + {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'}, + {'item': '100mm flat roof insulation', 'unit_price': None, 'unit': 'floor_m2'}, + {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, + {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': 'Installation of double glazing', 'unit_price': None, 'unit': 'window'}, + {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, + {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, + 'unit': 'floor_m2'}, + {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, + {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, + {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, + {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'} + ] + pricing_data = pd.DataFrame(pricing_data) + # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] @@ -470,7 +497,8 @@ def main(): "survey_key": survey_key, "starting_sap": starting_sap, "measure": measure["measure"], - "description": measure.get("description", "") + "description": measure.get("description", ""), + "sap_points": measure.get("sap_points", 0) }) # Convert the normalized list into a DataFrame. @@ -485,16 +513,23 @@ def main(): fill_value=None ).reset_index() - # Step 3: Extract starting SAP for each survey key. 
+ # Step 3: Calculate the total sap points for each survey. + total_sap_points = measures_df.groupby("survey_key")["sap_points"].sum().reset_index() + total_sap_points.columns = ["survey_key", "total_sap_points"] + + # Merge total sap points into the pivoted measures. + pivoted_measures = pd.merge(pivoted_measures, total_sap_points, on="survey_key", how="left") + + # Step 4: Extract starting SAP for each survey key. starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] # Merge starting SAP back onto pivoted measures. result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") - # Step 4: Calculate the ending SAP using the total sap points. - # Note: If you want to use total sap points, you'll need to update the total calculation accordingly. + # Step 5: Calculate the ending SAP. + result_df["ending_sap"] = result_df["starting_sap"] + result_df["total_sap_points"] - # Step 5: Merge the result with the measures_data to get the final DataFrame. + # Step 6: Merge the result with the measures_data to get the final DataFrame. 
final_measures = measures_data.merge( result_df, how="left", on="survey_key" ) From 93d375bc7a4f0e845c3bb13c9ff00b4b33fd7ff1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 19:11:40 +0100 Subject: [PATCH 09/59] adding aiha costing --- etl/customers/aiha/xml_extraction.py | 46 +++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 563ed7ca..29ac44c6 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -65,7 +65,7 @@ def main(): # THis is the data we need for the AIHA project measures_data = extracted_surveys[ - ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating"] + ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating", "number_of_floors"] ] measures_data = measures_data.sort_values("survey_key", ascending=True) @@ -459,15 +459,20 @@ def main(): }, ] - descs = [] - for r in recommended_measures: - for m in r["recommended_measures"]: - descs.append(m["description"]) - descs = list(set(descs)) + scaffolding_data = [ + { + "number_of_floors": 2, + "price": 841, + }, + { + "number_of_floors": 3, + "price": 1077, + } + ] - # TODO - need to add scaffolding + # TODO - Need an update cost for cylinder insulation pricing_data = [ - {'item': '80mm cylinder insulation', 'unit_price': None, 'unit': 'unit'}, + {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'}, {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'}, @@ -486,6 +491,31 @@ def main(): ] pricing_data = pd.DataFrame(pricing_data) + for recommendation in recommended_measures: + + property_data = measures_data[measures_data["survey_key"] == 
recommendation["survey_key"]].squeeze() + + for measure in recommendation["recommended_measures"]: + measure_pricing = pricing_data[pricing_data["item"] == measure["description"]] + measure_unit = measure_pricing["unit"].values[0] + if measure_unit is None: + blah + continue + + if measure_unit == "unit": + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + continue + + if measure_unit == "unit_needs_scaffolding": + # We need the number of floors + n_floors = property_data["number_of_floors"] + cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding + + blah + + measure["total"] = pricing_data[pricing_data["item"] == measure["measure"]]["unit_price"].values[0] + # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] From 854c784bd9c4341546ea57d2a0549b40552fbd92 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Oct 2024 19:32:15 +0100 Subject: [PATCH 10/59] working on the costing methodology --- etl/customers/aiha/xml_extraction.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 29ac44c6..4d4705c9 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -3,6 +3,7 @@ from io import BytesIO import pandas as pd +from etl.ownership.config import EXCLUDED_UPRNS from etl.xml_survey_extraction.XmlParser import XmlParser SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" @@ -91,6 +92,7 @@ def main(): # - AIH001-11 - The layout of this unit is confusing, is there roof access? # - AIH001-12 - Why was there not access to the cylinder? # - AIH001-12 - Is the need to draught proofing due to the windows? + # - AIH001-04 - is the flat roof area correct? 
recommended_measures = [ { @@ -132,6 +134,7 @@ def main(): { "measure": "Loft Insulation", "description": "300mm loft insulation", + "floor_area": 80, # Based on area of 1st floor "sap_points": 8, "ending_sap": 61 }, @@ -152,6 +155,7 @@ def main(): { "measure": "Flat Roof Insulation", "description": "100mm flat roof insulation", + "floor_area": 39.1482, # based on area of top floor "sap_points": 4, "ending_sap": 52 }, @@ -185,6 +189,7 @@ def main(): { "measure": "Flat Roof Insulation", "description": "100mm flat roof insulation", + "floor_area": 49.48, # based on area of top floor "sap_points": 5, "ending_sap": 59, }, @@ -256,6 +261,7 @@ def main(): { "measure": "Loft Insulation", "description": "300mm loft insulation", + "floor_area": 54.2864, # Based on area of top floor "sap_points": 2, "ending_sap": 58, }, @@ -390,6 +396,7 @@ def main(): { "measure": "Flat Roof Insulation", "description": "100mm flat roof insulation", + "floor_area": 33.06, # Based on area of the extension "sap_points": 2, "ending_sap": 61, }, @@ -445,7 +452,8 @@ def main(): }, { "measure": "Loft Insulation", - "description": "300mm loft insulation", + "description": "300mm loft insulation", # Based on area of main building + "floor_area": 59.20, "sap_points": 1, "ending_sap": 69, }, @@ -511,10 +519,18 @@ def main(): n_floors = property_data["number_of_floors"] cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding + continue - blah + if measure_unit == "floor_m2": + floor_area = measure["floor_area"] + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * floor_area + continue - measure["total"] = pricing_data[pricing_data["item"] == measure["measure"]]["unit_price"].values[0] + if measure_unit == "hlp_m2": + hlp = measure["hlp"] + measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * hlp + + raise Exception("Unknown 
unit type") # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] From 8325f1bf7a7bcf0cb7ebd94f6a83c49684163e17 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 10:18:53 +0000 Subject: [PATCH 11/59] Finished costings WIP --- etl/customers/aiha/xml_extraction.py | 76 ++++++++++++++++------------ 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 4d4705c9..c246105a 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -3,10 +3,10 @@ from io import BytesIO import pandas as pd -from etl.ownership.config import EXCLUDED_UPRNS from etl.xml_survey_extraction.XmlParser import XmlParser SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" +CONTINGENCY_RATE = 0.26 def main(): @@ -274,6 +274,7 @@ def main(): { "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", + "hlp": 24.13 * 2.63, "sap_points": 5, "ending_sap": 69, }, @@ -292,12 +293,14 @@ def main(): { "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", + "hlp": (22.35 * 3.24) + (22.13 * 2.53), "sap_points": 8, "ending_sap": 52, }, { "measure": "Cavity Wall Insulation", "description": "CWI to rdSAP default standard", + "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension "sap_points": 1, "ending_sap": 53, }, @@ -328,6 +331,7 @@ def main(): { "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", + "hlp": (18.50 * 3.12) + (19.00 * 2.75), "sap_points": 5, "ending_sap": 68, }, @@ -346,12 +350,15 @@ def main(): { "measure": "Double Glazing", "description": "Installation of double glazing", + "n_windows": 20, # Counted the bay windows each as 3 + "windows_area": 10.66, "sap_points": 2, "ending_sap": 48, }, { "measure": "Draught Proofing", "description": 
"Window draught proofing improvements", + "n_windows": 20, # Counted the bay windows each as 3 "sap_points": 1, "ending_sap": 49, }, @@ -390,6 +397,7 @@ def main(): { "measure": "Roof Insulation", "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "floor_area": 39.75, # based on the floor area of the RIR "sap_points": 6, "ending_sap": 59, }, @@ -403,6 +411,7 @@ def main(): { "measure": "Cavity Wall Insulation", "description": "CWI to rdSAP default standard", + "hlp": (35.40 * 2.65) + (26.70 * 2.73) + (16.30 * 2.71), # 1st & 2nd extension "sap_points": 6, "ending_sap": 67, }, @@ -441,6 +450,7 @@ def main(): { "measure": "Cavity Wall Insulation", "description": "CWI to rdSAP default standard", + "hlp": (11.00 * 2.6) + (11.00 * 2.65) + (4.60 * 2.7), "sap_points": 5, "ending_sap": 68, }, @@ -483,11 +493,11 @@ def main(): {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'}, {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, - {'item': 'Window draught proofing improvements', 'unit_price': None, 'unit': 'unit'}, - {'item': '100mm flat roof insulation', 'unit_price': None, 'unit': 'floor_m2'}, + {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'}, + {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, - {'item': 'Installation of double glazing', 'unit_price': None, 'unit': 'window'}, + {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, {'item': '100mm+ RIR insulation on all 
surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, @@ -500,51 +510,49 @@ def main(): pricing_data = pd.DataFrame(pricing_data) for recommendation in recommended_measures: - property_data = measures_data[measures_data["survey_key"] == recommendation["survey_key"]].squeeze() + total_cost = 0 for measure in recommendation["recommended_measures"]: measure_pricing = pricing_data[pricing_data["item"] == measure["description"]] measure_unit = measure_pricing["unit"].values[0] - if measure_unit is None: - blah - continue - if measure_unit == "unit": - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) - continue - - if measure_unit == "unit_needs_scaffolding": - # We need the number of floors + if measure_unit in ["unit", None]: + measure_cost = float(measure_pricing["unit_price"].values[0]) + elif measure_unit == "unit_needs_scaffolding": n_floors = property_data["number_of_floors"] - cost_of_scalfolding = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) + cost_of_scalfolding - continue + scaffolding_cost = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] + measure_cost = float(measure_pricing["unit_price"].values[0]) + scaffolding_cost + elif measure_unit == "floor_m2": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["floor_area"] + elif measure_unit == "hlp_m2": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["hlp"] + elif measure_unit == "window": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["n_windows"] + else: + raise Exception("Unknown unit type") - if measure_unit == "floor_m2": - floor_area = measure["floor_area"] - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * floor_area - continue + measure["Total Cost"] = measure_cost + total_cost += measure_cost - if measure_unit == "hlp_m2": - hlp = 
measure["hlp"] - measure["Total Cost"] = float(measure_pricing["unit_price"].values[0]) * hlp - - raise Exception("Unknown unit type") + recommendation["total_cost"] = total_cost # Step 1: Normalize the recommended_measures data into a DataFrame. normalized_measures = [] - for survey in recommended_measures: survey_key = survey["survey_key"] starting_sap = survey["starting_sap"] + total_cost = survey.get("total_cost", 0) + for measure in survey.get("recommended_measures", []): normalized_measures.append({ "survey_key": survey_key, "starting_sap": starting_sap, "measure": measure["measure"], "description": measure.get("description", ""), - "sap_points": measure.get("sap_points", 0) + "sap_points": measure.get("sap_points", 0), + "measure_cost": measure.get("Total Cost", 0), + "total_cost": total_cost }) # Convert the normalized list into a DataFrame. @@ -559,12 +567,16 @@ def main(): fill_value=None ).reset_index() - # Step 3: Calculate the total sap points for each survey. - total_sap_points = measures_df.groupby("survey_key")["sap_points"].sum().reset_index() - total_sap_points.columns = ["survey_key", "total_sap_points"] + # Step 3: Calculate the total sap points and total cost for each survey. + sap_cost_totals = measures_df.groupby("survey_key").agg( + total_sap_points=("sap_points", "sum"), + total_cost_of_measures=("measure_cost", "sum") + ).reset_index() # Merge total sap points into the pivoted measures. - pivoted_measures = pd.merge(pivoted_measures, total_sap_points, on="survey_key", how="left") + pivoted_measures = pd.merge(pivoted_measures, sap_cost_totals, on="survey_key", how="left") + pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE + pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] # Step 4: Extract starting SAP for each survey key. 
starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] From 7513e475d3cac3a21a95b0096833a43914ee7974 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 10:57:26 +0000 Subject: [PATCH 12/59] adding in the basic structure of the extraction code --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../stonewater/Wave 3 Preparation.py | 92 +++++++++++++++++++ .../requirements/requirements-wave-3-prep.txt | 1 + 4 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 etl/customers/stonewater/Wave 3 Preparation.py create mode 100644 etl/customers/stonewater/requirements/requirements-wave-3-prep.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py new file mode 100644 index 00000000..bd916494 --- /dev/null +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -0,0 +1,92 @@ +import os +import PyPDF2 +import re + +FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" + + +def extract_summary_report(pdf_path): + """ + Extracts specific data from the provided PDF file. 
+ Data includes: + - Current SAP rating + - Fuel Bill + - Emissions (t/year) + """ + data = { + "Current SAP rating": None, + "Fuel Bill": None, + "Emissions (t/year)": None, + } + + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Extract Current SAP rating + sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) + if sap_match: + data["Current SAP rating"] = sap_match.group(1) + + # Extract Fuel Bill + fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) + if fuel_bill_match: + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + # Extract Emissions + emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text) + if emissions_match: + data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes" + + return data + + +def main(): + """ + This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. + """ + # List only directories in the specified FILE_PATH + survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] + + extracted_data = [] + for survey_folder in survey_folders: + # List the folders inside of the survey folder + survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder)) + if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))] + + if not survey_subfolders: + continue + + # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment: + # If it exists, we will use the data from that folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + # List contents of the retrofit folder + retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder)) + + if not retrofit_files: + continue + + # We now look for specific files: + # 1) Check the summary report.- the title will contain the word 
"summary" (lowercase) and the file extension is + # .pdf + summary_report = next( + (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None + ) + if summary_report is not None: + pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report) + summary_data = extract_summary_report(pdf_path) + summary_data = { + "survey_folder": survey_folder, + **summary_data + } + extracted_data.append(summary_data) + continue + + raise NotImplementedError("IMPLEMENT ME!") + + +if __name__ == "__main__": + main() diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt new file mode 100644 index 00000000..e9a5c8ea --- /dev/null +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -0,0 +1 @@ +PyPDF2 From 0332c77098b4b77576422eb6b1cf1898f0ed79c3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 11:21:54 +0000 Subject: [PATCH 13/59] [Crefactoring structure of extraction code --- .../stonewater/Wave 3 Preparation.py | 80 +++++++++++++------ 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bd916494..976a953f 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -43,6 +43,42 @@ def extract_summary_report(pdf_path): return data +def extract_retrofit_assessment_folder(retrofit_folder_path): + """ + Handles extraction from a retrofit assessment folder if it exists and has content. 
+ """ + retrofit_files = os.listdir(retrofit_folder_path) + + # Find the summary report in the retrofit folder + summary_report = next( + (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None + ) + + if summary_report: + pdf_path = os.path.join(retrofit_folder_path, summary_report) + return extract_summary_report(pdf_path) + + return None # If no relevant PDF is found + + +def extract_from_survey_folder_files(survey_folder_path): + """ + Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. + """ + survey_files = os.listdir(survey_folder_path) + + # Look for a summary report directly in the survey folder + summary_report = next( + (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None + ) + + if summary_report: + pdf_path = os.path.join(survey_folder_path, summary_report) + return extract_summary_report(pdf_path) + + return None # If no relevant PDF is found + + def main(): """ This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. 
@@ -52,40 +88,38 @@ def main(): extracted_data = [] for survey_folder in survey_folders: + survey_folder_path = os.path.join(FILE_PATH, survey_folder) + # List the folders inside of the survey folder - survey_subfolders = [name for name in os.listdir(os.path.join(FILE_PATH, survey_folder)) - if os.path.isdir(os.path.join(FILE_PATH, survey_folder, name))] + survey_subfolders = [name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name))] - if not survey_subfolders: - continue - - # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment: - # If it exists, we will use the data from that folder + # Check if there's a "retrofit assessment" folder retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) - # List contents of the retrofit folder - retrofit_files = os.listdir(os.path.join(FILE_PATH, survey_folder, retrofit_folder)) + # If retrofit assessment folder exists, check if it has content + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_assessment_folder(retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data + } + extracted_data.append(summary_data) + continue - if not retrofit_files: - continue - - # We now look for specific files: - # 1) Check the summary report.- the title will contain the word "summary" (lowercase) and the file extension is - # .pdf - summary_report = next( - (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) - if summary_report is not None: - pdf_path = os.path.join(FILE_PATH, survey_folder, retrofit_folder, summary_report) - summary_data = extract_summary_report(pdf_path) + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = 
extract_from_survey_folder_files(survey_folder_path) + if summary_data: summary_data = { "survey_folder": survey_folder, **summary_data } extracted_data.append(summary_data) - continue - raise NotImplementedError("IMPLEMENT ME!") + print("Extracted Data:", extracted_data) if __name__ == "__main__": From cf2a94cb365b3903a733653136ae793b6a8299a4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 12:04:57 +0000 Subject: [PATCH 14/59] extracting epr --- .../stonewater/Wave 3 Preparation.py | 94 +++++++++++++++++-- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 976a953f..53d5bb34 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -43,6 +43,65 @@ def extract_summary_report(pdf_path): return data +def extract_epr(pdf_path): + """ + Extracts specific data from an Energy Report (EPR) PDF file. 
+ """ + data = { + "Address": None, + "Estimated Annual Costs": None, + "Current SAP": None, + "Space Heating": None, + "Water Heating": None, + "Fuel Bill": None, + } + + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Extract Address + address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + data["Address"] = address_match.group(1).strip() + + # Extract Total Floor Area + area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) + data["Total Floor Area"] = area_match.group(1) + + # Extract Estimated Annual Costs + cost_match = re.search(r"TOTAL\s*£(\d+)", text) + data["Estimated Annual Costs"] = f"£{cost_match.group(1)}" + + # Extract Current SAP rating + # Updated Regular Expression to find "GG (1-20)" followed by two numbers + sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + + # Extract and validate the Current and Potential SAP ratings + current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) + # Ensure potential is greater than or equal to current + if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: + data["Current SAP"] = current_sap + data["Potential SAP"] = potential_sap + else: + raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") + + # Extract Space Heating (kWh) + space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text) + data["Space Heating"] = int(space_heating_match.group(1)) + + # Extract Water Heating (kWh) + water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text) + data["Water Heating"] = int(water_heating_match.group(1)) + + # Extract Fuel Bill (total estimated costs) + fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + return data + + def extract_retrofit_assessment_folder(retrofit_folder_path): """ Handles 
extraction from a retrofit assessment folder if it exists and has content. @@ -61,22 +120,38 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): return None # If no relevant PDF is found +def is_energy_report(text): + """ + Determines if the provided text indicates that the PDF is an Energy Report. + Returns True if the text contains 'Energy Report'. + """ + return text.startswith("ENERGY REPORT") + + def extract_from_survey_folder_files(survey_folder_path): """ Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. """ - survey_files = os.listdir(survey_folder_path) + survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")] - # Look for a summary report directly in the survey folder - summary_report = next( - (name for name in survey_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) + for pdf_file in survey_files: + pdf_path = os.path.join(survey_folder_path, pdf_file) - if summary_report: - pdf_path = os.path.join(survey_folder_path, summary_report) - return extract_summary_report(pdf_path) + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" - return None # If no relevant PDF is found + if is_energy_report(first_page_text): + # Treat this as an Energy Report + return extract_epr(pdf_path) + elif "summary" in pdf_file.lower(): + # Treat this as a Summary Report + return extract_summary_report(pdf_path) + else: + raise NotImplementedError("Implement me") + + return None def main(): @@ -109,7 +184,6 @@ def main(): } extracted_data.append(summary_data) continue - # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_from_survey_folder_files(survey_folder_path) if summary_data: From 33ea47e71d8b0a226629400dca5b6400b46daf96 Mon Sep 17 00:00:00 2001 From: Khalim 
Conn-Kowlessar Date: Mon, 28 Oct 2024 12:42:28 +0000 Subject: [PATCH 15/59] fixed address extraction --- .../stonewater/Wave 3 Preparation.py | 47 ++++++++++++++----- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 53d5bb34..bc567bd2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,6 +1,7 @@ import os import PyPDF2 import re +import pandas as pd FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -11,12 +12,12 @@ def extract_summary_report(pdf_path): Data includes: - Current SAP rating - Fuel Bill - - Emissions (t/year) + - Address """ data = { - "Current SAP rating": None, + "Address": None, + "Current SAP Rating": None, "Fuel Bill": None, - "Emissions (t/year)": None, } with open(pdf_path, "rb") as file: @@ -28,17 +29,36 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) if sap_match: - data["Current SAP rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) if fuel_bill_match: data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - # Extract Emissions - emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text) - if emissions_match: - data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes" + # Extract individual address components + postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + locality = 
re.search(r"Locality:\s*(.*?)\nTown:", text) + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + + # Clean extracted values and remove any prefixes + address_parts = [ + house_no.group(1).strip() if house_no else "", + house_name.group(1).strip() if house_name else "", + street.group(1).strip() if street else "", + locality.group(1).strip() if locality else "", + town.group(1).strip() if town else "", + county.group(1).strip() if county else "", + postcode.group(1).strip() if postcode else "" + ] + + # Join non-empty parts with a comma + data["Address"] = ", ".join([part for part in address_parts if part]) return data @@ -49,8 +69,7 @@ def extract_epr(pdf_path): """ data = { "Address": None, - "Estimated Annual Costs": None, - "Current SAP": None, + "Current SAP Rating": None, "Space Heating": None, "Water Heating": None, "Fuel Bill": None, @@ -82,8 +101,8 @@ def extract_epr(pdf_path): current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) # Ensure potential is greater than or equal to current if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: - data["Current SAP"] = current_sap - data["Potential SAP"] = potential_sap + data["Current SAP Rating"] = current_sap + data["Potential SAP Rating"] = potential_sap else: raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") @@ -117,6 +136,8 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): pdf_path = os.path.join(retrofit_folder_path, summary_report) return extract_summary_report(pdf_path) + raise Exception("Not Implemented") + return None # If no relevant PDF is found @@ -193,7 +214,7 @@ def main(): } extracted_data.append(summary_data) - print("Extracted Data:", extracted_data) + extracted_data = pd.DataFrame(extracted_data) if __name__ == "__main__": diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt 
b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index e9a5c8ea..2cabb047 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1 +1,2 @@ PyPDF2 +pandas From c68e4f017e48f4cb12639cbd9f69ce40849e68fd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 12:43:59 +0000 Subject: [PATCH 16/59] additional data cleaning --- etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bc567bd2..c6736ba8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -86,12 +86,8 @@ def extract_epr(pdf_path): data["Address"] = address_match.group(1).strip() # Extract Total Floor Area - area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) - data["Total Floor Area"] = area_match.group(1) - - # Extract Estimated Annual Costs - cost_match = re.search(r"TOTAL\s*£(\d+)", text) - data["Estimated Annual Costs"] = f"£{cost_match.group(1)}" + # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) + # data["Total Floor Area"] = area_match.group(1) # Extract Current SAP rating # Updated Regular Expression to find "GG (1-20)" followed by two numbers @@ -216,6 +212,5 @@ def main(): extracted_data = pd.DataFrame(extracted_data) - -if __name__ == "__main__": - main() +# if __name__ == "__main__": +# main() From 70d02075cf1da79ccce4950cb8080a9b05745a6d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:16:33 +0000 Subject: [PATCH 17/59] allowing extract_retrofit_assessment_folder to handle eprs --- .../stonewater/Wave 3 Preparation.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py 
b/etl/customers/stonewater/Wave 3 Preparation.py index c6736ba8..14e50460 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -121,20 +121,25 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): """ Handles extraction from a retrofit assessment folder if it exists and has content. """ - retrofit_files = os.listdir(retrofit_folder_path) + retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")] - # Find the summary report in the retrofit folder - summary_report = next( - (name for name in retrofit_files if "summary" in name.lower() and name.endswith(".pdf")), None - ) + for pdf_file in retrofit_files: + pdf_path = os.path.join(retrofit_folder_path, pdf_file) - if summary_report: - pdf_path = os.path.join(retrofit_folder_path, summary_report) - return extract_summary_report(pdf_path) + # Attempt to read the first page of the PDF to determine the report type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" - raise Exception("Not Implemented") + if is_energy_report(first_page_text): + # Treat this as an Energy Report + return extract_epr(pdf_path) + elif "summary" in pdf_file.lower(): + # Treat this as a Summary Report + return extract_summary_report(pdf_path) - return None # If no relevant PDF is found + # If no relevant PDF is found, raise an exception + raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") def is_energy_report(text): From 371f17f87e986a5d70ae7b0e66f9748f82adac6e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:20:33 +0000 Subject: [PATCH 18/59] adding additional catch for summary report --- etl/customers/stonewater/Wave 3 Preparation.py | 14 +++++++++++++- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git 
a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 14e50460..dc71d449 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2,6 +2,7 @@ import os import PyPDF2 import re import pandas as pd +from tqdm import tqdm FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -137,6 +138,10 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): elif "summary" in pdf_file.lower(): # Treat this as a Summary Report return extract_summary_report(pdf_path) + elif is_summary_report(first_page_text): + # other ways to detect a summary report + # Treat this as a Summary Report + return extract_summary_report(pdf_path) # If no relevant PDF is found, raise an exception raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") @@ -150,6 +155,13 @@ def is_energy_report(text): return text.startswith("ENERGY REPORT") +def is_summary_report(text): + """ + Determines if the provided text indicates that the PDF is a Summary Report. + """ + return text.startswith("Summary Information") + + def extract_from_survey_folder_files(survey_folder_path): """ Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. 
@@ -184,7 +196,7 @@ def main(): survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] extracted_data = [] - for survey_folder in survey_folders: + for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(FILE_PATH, survey_folder) # List the folders inside of the survey folder diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 2cabb047..70bec3cc 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1,2 +1,3 @@ PyPDF2 pandas +tqdm From 4e9acdeb8e2222b7c44c05749667fe258fa87982 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:23:34 +0000 Subject: [PATCH 19/59] refactored --- .../stonewater/Wave 3 Preparation.py | 67 +++++++------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index dc71d449..30a23e86 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -118,30 +118,15 @@ def extract_epr(pdf_path): return data -def extract_retrofit_assessment_folder(retrofit_folder_path): +def extract_retrofit_pdfs(data_folder_path): """ - Handles extraction from a retrofit assessment folder if it exists and has content. + Handles extraction from a retrofit data folder if it exists and has content. 
""" - retrofit_files = [f for f in os.listdir(retrofit_folder_path) if f.endswith(".pdf")] + retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")] for pdf_file in retrofit_files: - pdf_path = os.path.join(retrofit_folder_path, pdf_file) - - # Attempt to read the first page of the PDF to determine the report type - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - first_page_text = reader.pages[0].extract_text() if reader.pages else "" - - if is_energy_report(first_page_text): - # Treat this as an Energy Report - return extract_epr(pdf_path) - elif "summary" in pdf_file.lower(): - # Treat this as a Summary Report - return extract_summary_report(pdf_path) - elif is_summary_report(first_page_text): - # other ways to detect a summary report - # Treat this as a Summary Report - return extract_summary_report(pdf_path) + pdf_path = os.path.join(data_folder_path, pdf_file) + return detect_and_parse_report(pdf_path, pdf_file) # If no relevant PDF is found, raise an exception raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") @@ -162,30 +147,26 @@ def is_summary_report(text): return text.startswith("Summary Information") -def extract_from_survey_folder_files(survey_folder_path): +def detect_and_parse_report(pdf_path, pdf_file): """ - Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists. + Detects the type of report and extracts the relevant data. 
+ :param pdf_path: String path to the PDF file + :param pdf_file: String name of the PDF file + :return: """ - survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")] + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" - for pdf_file in survey_files: - pdf_path = os.path.join(survey_folder_path, pdf_file) - - # Attempt to read the first page of the PDF to determine type - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - first_page_text = reader.pages[0].extract_text() if reader.pages else "" - - if is_energy_report(first_page_text): - # Treat this as an Energy Report - return extract_epr(pdf_path) - elif "summary" in pdf_file.lower(): - # Treat this as a Summary Report - return extract_summary_report(pdf_path) - else: - raise NotImplementedError("Implement me") - - return None + if is_energy_report(first_page_text): + # Treat this as an Energy Report + return extract_epr(pdf_path) + elif "summary" in pdf_file.lower(): + # Treat this as a Summary Report + return extract_summary_report(pdf_path) + else: + raise NotImplementedError("Implement me") def main(): @@ -210,7 +191,7 @@ def main(): if retrofit_folder: retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) if os.listdir(retrofit_folder_path): # If not empty - summary_data = extract_retrofit_assessment_folder(retrofit_folder_path) + summary_data = extract_retrofit_pdfs(retrofit_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, @@ -219,7 +200,7 @@ def main(): extracted_data.append(summary_data) continue # If no retrofit folder or it was empty, check files in survey_folder - summary_data = extract_from_survey_folder_files(survey_folder_path) + summary_data = extract_retrofit_pdfs(survey_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, 
From 1db4c4319e2b7992405fb977705a90e8b3fb8618 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 14:28:27 +0000 Subject: [PATCH 20/59] removing raising of exception at end of function --- etl/customers/stonewater/Wave 3 Preparation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 30a23e86..777f96c5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -128,8 +128,8 @@ def extract_retrofit_pdfs(data_folder_path): pdf_path = os.path.join(data_folder_path, pdf_file) return detect_and_parse_report(pdf_path, pdf_file) - # If no relevant PDF is found, raise an exception - raise FileNotFoundError("No valid report (EPR or Summary) found in the retrofit assessment folder.") + # If no relevant PDF is found, exit + return None def is_energy_report(text): @@ -199,6 +199,10 @@ def main(): } extracted_data.append(summary_data) continue + else: + # Then we have an empty Retrofit Assessment folder + continue + # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_retrofit_pdfs(survey_folder_path) if summary_data: From 2a17831c7223e7614c6413c2f2b4fa09aca3d3a9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 17:16:27 +0000 Subject: [PATCH 21/59] added detection of condition report --- etl/customers/aiha/xml_extraction.py | 26 ++++++++++--------- .../stonewater/Wave 3 Preparation.py | 18 ++++++++++--- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index c246105a..038e8593 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -74,25 +74,26 @@ def main(): # The properties will still have "Very poor" ratings for their hot water # TODO - # - AIH001-03 has a basement and so we 
should discount this area from the ground floor # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft + # [Can't remember, not clear - Chenai will check] # - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the # best option for this property due to it being extrememly large and the walls being uninsulated. It might not # be performant enough in the winter, when COP will be more like 1.5. # - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are # in the property? Does it make sense to have such a large solar PV system (5.6kWp)? # - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C - # - Generally, should we consider insulated doors? + # - Potential measure - search for the cylinder and insulate it # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same - # buulding - # - AIH001-09 - The extension is 1900-1929 but has a cavity wall - # - AIH001-09 - Is it not possible to install a loft hatch? - # - AIH001-09 - Why is there assumed secondary heating? + # buulding [Question for Lewis & Kevin] + # - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from + # the other unit] + # - AIH001-09 - Why is there assumed secondary heating? [Question for Lewis & Kevin] # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? - # - AIH001-11 - The layout of this unit is confusing, is there roof access? - # - AIH001-12 - Why was there not access to the cylinder? - # - AIH001-12 - Is the need to draught proofing due to the windows? - # - AIH001-04 - is the flat roof area correct? + # [Question for Lewis & Kevin] + # - AIH001-11 - The layout of this unit is confusing, is there roof access? [NO!!!! - It's a Sun room!!] 
+ # - AIH001-12 - Why was there not access to the cylinder? [Sealed shut] + # - AIH001-12 - Is the need to draught proofing due to the windows? [This would be addressed by deailing with the + # windows] recommended_measures = [ { @@ -113,7 +114,7 @@ def main(): }, { "measure": "Solar PV", - "description": "4kWp Solar PV system", + "description": "5.6kWp Solar PV system", "config": [ { "size": "4kWp", @@ -497,6 +498,7 @@ def main(): {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, @@ -505,7 +507,7 @@ def main(): {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, - {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'} + {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}, ] pricing_data = pd.DataFrame(pricing_data) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 777f96c5..62cec009 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -126,7 +126,10 @@ def extract_retrofit_pdfs(data_folder_path): for pdf_file in retrofit_files: pdf_path = os.path.join(data_folder_path, pdf_file) - return detect_and_parse_report(pdf_path, pdf_file) + extracted = detect_and_parse_report(pdf_path, pdf_file) + if extracted is 
not None: + return extracted + continue # If no relevant PDF is found, exit return None @@ -165,10 +168,19 @@ def detect_and_parse_report(pdf_path, pdf_file): elif "summary" in pdf_file.lower(): # Treat this as a Summary Report return extract_summary_report(pdf_path) + elif is_condition_report(first_page_text): + return None else: raise NotImplementedError("Implement me") +def is_condition_report(text): + """ + Determines if the provided text indicates that the PDF is a Condition Report. + """ + return text.startswith("OsmosisACDNEWPAS2035ConditionReport") + + def main(): """ This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. @@ -191,7 +203,7 @@ def main(): if retrofit_folder: retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) if os.listdir(retrofit_folder_path): # If not empty - summary_data = extract_retrofit_pdfs(retrofit_folder_path) + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, @@ -204,7 +216,7 @@ def main(): continue # If no retrofit folder or it was empty, check files in survey_folder - summary_data = extract_retrofit_pdfs(survey_folder_path) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, From 54b09e88e15cfd6c824beff23f878525cb9d5d16 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 17:20:05 +0000 Subject: [PATCH 22/59] added usage of is_summary_report --- etl/customers/stonewater/Wave 3 Preparation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 62cec009..988a544a 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -165,7 +165,7 @@ def detect_and_parse_report(pdf_path, pdf_file): if 
is_energy_report(first_page_text): # Treat this as an Energy Report return extract_epr(pdf_path) - elif "summary" in pdf_file.lower(): + elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): # Treat this as a Summary Report return extract_summary_report(pdf_path) elif is_condition_report(first_page_text): From 6e8d9a025cc5b64c1a632bd9c95de140e9e58f82 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 19:26:14 +0000 Subject: [PATCH 23/59] adjusting search epc function to handle pydantic issues for the moment --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/SearchEpc.py | 10 +- .../livewest/route_march_2024_10_28.py | 171 ++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 2 + 5 files changed, 178 insertions(+), 9 deletions(-) create mode 100644 etl/customers/livewest/route_march_2024_10_28.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..850c0cda 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..e4070118 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 367d8c85..f9e978c6 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -256,16 +256,12 @@ class SearchEpc: else: params = {"address": self.address1, "postcode": self.postcode} + url = os.path.join(self.client.domestic.host, "search") + for retry in range(self.max_retries): try: - if "uprn" in params: - # We use the direct call method inside, since we need to implement uprn as a valid - # parameter for the search function - url = os.path.join(self.client.domestic.host, "search") - response = self.client.domestic.call(method="get", url=url, params=params) - else: - response = self.client.domestic.search(params=params, size=size) + response = self.client.domestic.call(method="get", url=url, params=params) if response: self.data = response diff --git 
a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py new file mode 100644 index 00000000..fff1e7e7 --- /dev/null +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -0,0 +1,171 @@ +import os + +import pandas as pd +from tqdm import tqdm + +from dotenv import load_dotenv +from utils.s3 import read_excel_from_s3 +from backend.SearchEpc import SearchEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0 + ) + + epc_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = 
searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "asset_list_address": full_address, + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + + epc_df = pd.DataFrame(epc_data) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "asset_list_address", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description" + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + left_on=["ADDRESS"], + right_on=["asset_list_address"] + ) + + asset_list = asset_list.drop(columns=["asset_list_address"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]), axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + asset_list["Number of Habitable Rooms"] = 
asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"], + axis=1 + ) + + # Store as an excel + filename = "LHP EPC Data pull.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 988a544a..8e1a7fdb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -226,5 +226,7 @@ def main(): extracted_data = pd.DataFrame(extracted_data) + missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()] + # if __name__ == "__main__": # main() From 86ca5b40074015c20dd35fe38eda7ac3799139f4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 19:50:09 +0000 Subject: [PATCH 24/59] addded catch for condition report --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../livewest/route_march_2024_10_28.py | 69 ++++++++++--------- .../stonewater/Wave 3 Preparation.py | 2 +- 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 850c0cda..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 
e4070118..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py index fff1e7e7..47b86e89 100644 --- a/etl/customers/livewest/route_march_2024_10_28.py +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -1,4 +1,5 @@ import os +import time import pandas as pd from tqdm import tqdm @@ -46,42 +47,46 @@ def app(): ) epc_data = [] + errors = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - - postcode = home["Postcode"] - house_number = home["Number"] - full_address = home["Full Address"] - - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] - epc = { - "asset_list_address": full_address, - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"] - } + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None - epc_data.append(epc) + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + 
try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "asset_list_address": full_address, + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(e) + time.sleep(5) epc_df = pd.DataFrame(epc_data) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8e1a7fdb..fc11f1c0 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -178,7 +178,7 @@ def is_condition_report(text): """ Determines if the provided text indicates that the PDF is a Condition Report. """ - return text.startswith("OsmosisACDNEWPAS2035ConditionReport") + return text.startswith("OsmosisACDNEWPAS2035ConditionReport") or text.startswith("OsmosisACDPAS2035ConditionReport") def main(): From 8bf5b23410caccce29ddfaaf30953c1b48db4c7d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 20:29:31 +0000 Subject: [PATCH 25/59] handling extraction of windows data --- .../livewest/route_march_2024_10_28.py | 3 +- .../stonewater/Wave 3 Preparation.py | 58 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py index 47b86e89..c19c78b1 100644 --- a/etl/customers/livewest/route_march_2024_10_28.py +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -61,7 +61,8 @@ def app(): os_api_key="", property_type=None, fast=True, - full_address=full_address + full_address=full_address, + max_retries=3 ) # Force the skipping of estimating the EPC searcher.ordnance_survey_client.property_type = None diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fc11f1c0..a8e06416 100644 --- 
a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,6 +3,7 @@ import PyPDF2 import re import pandas as pd from tqdm import tqdm +from collections import Counter FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -19,6 +20,8 @@ def extract_summary_report(pdf_path): "Address": None, "Current SAP Rating": None, "Fuel Bill": None, + "Window Age Description": None, + "Window Age Description Proportion (%)": None, } with open(pdf_path, "rb") as file: @@ -61,9 +64,56 @@ def extract_summary_report(pdf_path): # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) + return data +def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. + + Parameters: + windows_text (str): The text section containing window data. + + Returns: + dict: A dictionary with the most common window age description and its proportion. 
+ """ + # Clean up windows_text by removing line breaks for better pattern matching + windows_text = windows_text.replace("\n", "") + + # Define possible window age descriptions + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + + # Count occurrences of each description + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + # Determine the most common description and calculate its proportion + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion + } + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. 
@@ -74,6 +124,8 @@ def extract_epr(pdf_path): "Space Heating": None, "Water Heating": None, "Fuel Bill": None, + "Window Age Description": None, + "Window Age Description Proportion (%)": None, } with open(pdf_path, "rb") as file: @@ -115,6 +167,12 @@ def extract_epr(pdf_path): fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + # Extract the windows data + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) + return data From e22baed16fcf6ce86e38266d557aab3cc529953d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 12:29:24 +0000 Subject: [PATCH 26/59] sorted livewest data pull --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../livewest/route_march_2024_10_28.py | 148 ++++++++++++------ .../stonewater/Wave 3 Preparation.py | 2 + 4 files changed, 102 insertions(+), 52 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..850c0cda 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..e4070118 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py index c19c78b1..1b259fba 100644 --- a/etl/customers/livewest/route_march_2024_10_28.py +++ b/etl/customers/livewest/route_march_2024_10_28.py @@ -19,6 +19,53 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +def get_data(asset_list): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home["Postcode"] + house_number = home["Number"] + full_address = home["Full Address"] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + 
auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"] + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + def app(): """ This app is EPC pulling data for some properties owned by Livewest @@ -45,56 +92,49 @@ def app(): asset_list = pd.read_excel( "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0 ) + asset_list["row_id"] = asset_list.index - epc_data = [] - errors = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - try: - postcode = home["Postcode"] - house_number = home["Number"] - full_address = home["Full Address"] + epc_data, errors = get_data(asset_list) - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=3 - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data(asset_list_failed) - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons - try: - 
property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - epc = { - "asset_list_address": full_address, - **searcher.newest_epc.copy(), - "recommendations": property_recommendations["rows"] - } - - epc_data.append(epc) - except Exception as e: - errors.append(e) - time.sleep(5) + # Append the failed data to the main data + epc_data.extend(epc_data_failed) epc_df = pd.DataFrame(epc_data) + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + # Retrieve just the data we need epc_df = epc_df[ [ - "asset_list_address", + "row_id", "uprn", "property-type", "built-form", @@ -110,7 +150,7 @@ def app(): "construction-age-band", "floor-height", "number-habitable-rooms", - "mainheat-description" + "mainheat-description", # "energy-consumption-current", # kwh/m2 ] @@ -119,11 +159,14 @@ def app(): asset_list = asset_list.merge( epc_df, how="left", - left_on=["ADDRESS"], - right_on=["asset_list_address"] + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" ) 
- asset_list = asset_list.drop(columns=["asset_list_address"]) + asset_list = asset_list.drop(columns=["row_id"]) # Rename the columns asset_list = asset_list.rename(columns={ @@ -140,14 +183,18 @@ def app(): "roof-description": "Roof Construction", "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC" + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" }) asset_list["Estimated Number of Floors"] = asset_list.apply( - lambda x: estimate_number_of_floors(property_type=x["Property Type"]), axis=1 + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 ) asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) asset_list["Estimated Perimeter (m)"] = asset_list.apply( @@ -157,7 +204,7 @@ def app(): ), axis=1 ) - asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply( + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( lambda x: estimate_external_wall_area( num_floors=x["Estimated Number of Floors"], floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, @@ -168,10 +215,11 @@ def app(): ) asset_list["Roof Insulation Thickness"] = asset_list.apply( - lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"], + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, axis=1 ) # Store as an excel - filename = "LHP EPC Data pull.xlsx" + filename = "livewest EPC Data pull - 29 Oct.xlsx" asset_list.to_excel(filename, index=False) diff --git 
a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index a8e06416..d8d01b22 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -283,6 +283,8 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + # Save this as a csv + # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()] From b7f402ba9d699ede3693068f8bec9e2087c0a8aa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 13:55:18 +0000 Subject: [PATCH 27/59] addded # Storeys --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/stonewater/Wave 3 Preparation.py | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 850c0cda..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index e4070118..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d8d01b22..b1b48cec 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -19,6 +19,7 @@ def extract_summary_report(pdf_path): data = { "Address": None, "Current SAP Rating": None, + "Number of Storeys": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -32,13 +33,15 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) - if sap_match: - data["Current SAP Rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1) + + # Number of storeys + storeys_match = re.search(r"Number of 
Storeys:\s*(\d+)", text) + data["Number of Storeys"] = int(storeys_match.group(1)) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) - if fuel_bill_match: - data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" # Extract individual address components postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) From 753bda6cb0bc4c8de266944c04ab99db7d74da3d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:21:01 +0000 Subject: [PATCH 28/59] extracting heating systems from summary report --- .../stonewater/Wave 3 Preparation.py | 86 ++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b1b48cec..863a6a6c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -19,10 +19,26 @@ def extract_summary_report(pdf_path): data = { "Address": None, "Current SAP Rating": None, - "Number of Storeys": None, + "Space Heating": None, + "Water Heating": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, + "Secondary Window Age Description": None, + "Secondary Window Age Description Proportion (%)": None, + "Number of Windows": None, + "Total Number of Doors": None, + "Number of Insulated Doors": None, + "Existing Primary Heating System": None, + "Existing Primary Heating PCDF Reference": None, + "Existing Primary Heating Controls": None, + "Existing Primary Heating % of Heat": None, + "Existing Secondary Heating System": None, + "Existing Secondary Heating PCDF Reference": None, + "Existing Secondary Heating Controls": None, + "Existing Secondary Heating % of Heat": None, + "Secondary Heating Code": None, + "Water Heating Code": None, } with open(pdf_path, "rb") as file: @@ -39,6 +55,10 @@ def extract_summary_report(pdf_path): storeys_match = 
re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) + # Extract Carbon Emissions + carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) + data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) + # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" @@ -66,12 +86,58 @@ def extract_summary_report(pdf_path): # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) + data["Postcode"] = postcode.group(1).strip() windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) windows_text = windows_section.group(1) window_data = extract_window_age_description(windows_text) data.update(window_data) + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Extract heating system + # Extract Primary Heating Data + # Extract Primary Heating Section + primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + primary_text).group(1) + data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) + ) + + # Extract Secondary Heating Section + 
secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( + 1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + return data @@ -111,9 +177,20 @@ def extract_window_age_description(windows_text): most_common_description, window_count = description_counts.most_common(1)[0] window_proportion = window_count / sum(description_counts.values()) * 100 + # Get the second most common and the proportion + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + return { "Window Age Description": most_common_description, - "Window Age Description Proportion (%)": window_proportion + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": 
sum(description_counts.values()) } @@ -129,6 +206,11 @@ def extract_epr(pdf_path): "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, + "Secondary Window Age Description": None, + "Secondary Window Age Description Proportion (%)": None, + "Number of Windows": None, + "Total Number of Doors": None, + "Number of Insulated Doors": None, } with open(pdf_path, "rb") as file: From 364b5b07e8f1ff29b3da3625014e4250fc5954ce Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:46:01 +0000 Subject: [PATCH 29/59] adding to extract eprs --- .../stonewater/Wave 3 Preparation.py | 101 +++++++++++++----- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 863a6a6c..4ab33732 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -18,6 +18,7 @@ def extract_summary_report(pdf_path): """ data = { "Address": None, + "Postcode": None, "Current SAP Rating": None, "Space Heating": None, "Water Heating": None, @@ -200,7 +201,9 @@ def extract_epr(pdf_path): """ data = { "Address": None, + "Postcode": None, "Current SAP Rating": None, + "Potential SAP Rating": None, "Space Heating": None, "Water Heating": None, "Fuel Bill": None, @@ -211,6 +214,16 @@ def extract_epr(pdf_path): "Number of Windows": None, "Total Number of Doors": None, "Number of Insulated Doors": None, + "Existing Primary Heating System": None, + "Existing Primary Heating PCDF Reference": None, + "Existing Primary Heating Controls": None, + "Existing Primary Heating % of Heat": None, + "Existing Secondary Heating System": None, + "Existing Secondary Heating PCDF Reference": None, + "Existing Secondary Heating Controls": None, + "Existing Secondary Heating % of Heat": None, + "Secondary Heating Code": None, + "Water Heating Code": None, } with open(pdf_path, "rb") as file: @@ -222,41 
+235,73 @@ def extract_epr(pdf_path): # Extract Address address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) data["Address"] = address_match.group(1).strip() + data["Postcode"] = data["Address"].split(",")[-1].strip() - # Extract Total Floor Area - # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) - # data["Total Floor Area"] = area_match.group(1) - - # Extract Current SAP rating - # Updated Regular Expression to find "GG (1-20)" followed by two numbers + # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap - # Extract and validate the Current and Potential SAP ratings - current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) - # Ensure potential is greater than or equal to current - if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: - data["Current SAP Rating"] = current_sap - data["Potential SAP Rating"] = potential_sap - else: - raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") - - # Extract Space Heating (kWh) - space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text) - data["Space Heating"] = int(space_heating_match.group(1)) - - # Extract Water Heating (kWh) - water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text) - data["Water Heating"] = int(water_heating_match.group(1)) - - # Extract Fuel Bill (total estimated costs) + # Extract Fuel Bill fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - # Extract the windows data + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Insulated 
Doors:\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Extract Primary Heating Section (Main Heating 1) + primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + primary_text).group(1) + data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) + ) + + # Extract Secondary Heating Section (Main Heating 2) + secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( + 1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + + if data["Existing Secondary Heating System"] == "": + data["Existing Secondary Heating Controls"] = "" + else: + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating 
Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + # Extract Windows information windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - data.update(window_data) + if windows_section: + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) return data From 9eb4720c91d22ed2084364d92a0c99cbb3088adc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:54:19 +0000 Subject: [PATCH 30/59] added peui --- etl/customers/stonewater/Wave 3 Preparation.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4ab33732..1b7b1bcd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -20,9 +20,8 @@ def extract_summary_report(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, - "Space Heating": None, - "Water Heating": None, "Fuel Bill": None, + "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, "Secondary Window Age Description": None, @@ -203,9 +202,8 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, - "Potential SAP Rating": None, - "Space Heating": None, - "Water Heating": None, + "Primary Energy Use Intensity (kWh/m2/yr)": None, + "Number of Storeys": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -242,6 +240,14 @@ def extract_epr(pdf_path): current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) data["Current SAP Rating"] = current_sap + # Extract the primary energy use intensity + 
additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + + # Extract Number of Storeys + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + data["Number of Storeys"] = int(storeys_match.group(1)) + # Extract Fuel Bill fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" From b74b8823d18d428888fd832c515cc81cb2c6bdf1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:59:32 +0000 Subject: [PATCH 31/59] fixing bug extracting from epr --- .../stonewater/Wave 3 Preparation.py | 54 ++++++++++++------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 1b7b1bcd..02a5cd83 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -261,36 +261,50 @@ def extract_epr(pdf_path): data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) # Extract Primary Heating Section (Main Heating 1) - primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + # We may not have a secondary heating + primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 primary_text = primary_heating_section.group(1) - data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( - 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - primary_text).group(1) - data["Existing Primary Heating Controls"] = 
re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( - 1).strip() + data["Existing Primary Heating System"] = re.search( + r"Main Heating Code\s*(.*?)\n", primary_text + ).group(1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1) + data["Existing Primary Heating Controls"] = re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip() data["Existing Primary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) ) # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) - secondary_text = secondary_heating_section.group(1) - - data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( - 1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - - if data["Existing Secondary Heating System"] == "": + if secondary_heating_section is None: + data["Existing Secondary Heating System"] = "" + data["Existing Secondary Heating PCDF Reference"] = "" data["Existing Secondary Heating Controls"] = "" + data["Existing Secondary Heating % of Heat"] = 0 + else: - data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", - secondary_text).group(1).strip() - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) - ) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search( + r"Main Heating Code\s*(.*?)\n", secondary_text + ).group(1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + + if data["Existing Secondary Heating System"] == "": + data["Existing 
Secondary Heating Controls"] = "" + else: + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) # Extract Secondary Heating and Water Heating Codes secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) From 9e752fca8db65d829cdac4ff15fc874fd086ad6d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 15:03:23 +0000 Subject: [PATCH 32/59] handling edge case extracting from summary report --- etl/customers/stonewater/Wave 3 Preparation.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 02a5cd83..0af43310 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -109,10 +109,12 @@ def extract_summary_report(pdf_path): data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - primary_text).group(1) - data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( - 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1) + data["Existing Primary Heating Controls"] = re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip() data["Existing Primary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) ) @@ -125,8 +127,10 @@ def extract_summary_report(pdf_path): 1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", 
secondary_text).group(1) - data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", - secondary_text).group(1).strip() + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) data["Existing Secondary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) ) @@ -287,7 +291,7 @@ def extract_epr(pdf_path): data["Existing Secondary Heating PCDF Reference"] = "" data["Existing Secondary Heating Controls"] = "" data["Existing Secondary Heating % of Heat"] = 0 - + else: secondary_text = secondary_heating_section.group(1) From a9ce5b68bb6b506b62179c7abac5f43da2498ad1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 15:11:16 +0000 Subject: [PATCH 33/59] debug extract of main heating code --- etl/customers/stonewater/Wave 3 Preparation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0af43310..bb100ae1 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -123,8 +123,8 @@ def extract_summary_report(pdf_path): secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) secondary_text = secondary_heating_section.group(1) - data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( - 1).strip() + main_heating_code_match = re.search(r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text) + data["Existing Secondary Heating System"] = main_heating_code_match.group(1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) 
second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) @@ -139,7 +139,11 @@ def extract_summary_report(pdf_path): secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data From 48369ae1505a769339f7adaf713d809e0bfdd208 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 15:18:11 +0000 Subject: [PATCH 34/59] refactor to prioritise epc --- .../stonewater/Wave 3 Preparation.py | 66 +++++++++++++++---- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bb100ae1..7f4f81e9 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -123,8 +123,10 @@ def extract_summary_report(pdf_path): secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) secondary_text = secondary_heating_section.group(1) - main_heating_code_match = re.search(r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text) - data["Existing Secondary Heating System"] = main_heating_code_match.group(1).strip() + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) second_heating_controls_match = 
re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) @@ -299,11 +301,14 @@ def extract_epr(pdf_path): else: secondary_text = secondary_heating_section.group(1) - data["Existing Secondary Heating System"] = re.search( - r"Main Heating Code\s*(.*?)\n", secondary_text - ).group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() + + data["Existing Secondary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", secondary_text + ).group(1) if data["Existing Secondary Heating System"] == "": data["Existing Secondary Heating Controls"] = "" @@ -334,20 +339,57 @@ def extract_epr(pdf_path): return data +def detect_report_type(pdf_path, pdf_file): + """ + Detects the type of report based on content or filename. + :param pdf_path: String path to the PDF file + :param pdf_file: String name of the PDF file + :return: String type of the report ("epr", "summary", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" + + if is_energy_report(first_page_text): + return "epr" + elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): + return "summary" + elif is_condition_report(first_page_text): + return "condition" + + return None + + def extract_retrofit_pdfs(data_folder_path): """ Handles extraction from a retrofit data folder if it exists and has content. + Prioritizes extracting data from an EPR if both EPR and summary report are present. 
""" retrofit_files = [f for f in os.listdir(data_folder_path) if f.endswith(".pdf")] + report_types = {"epr": None, "summary": None} + # First, identify the types of reports available for pdf_file in retrofit_files: pdf_path = os.path.join(data_folder_path, pdf_file) - extracted = detect_and_parse_report(pdf_path, pdf_file) - if extracted is not None: - return extracted - continue + report_type = detect_report_type(pdf_path, pdf_file) - # If no relevant PDF is found, exit + if report_type == "epr": + report_types["epr"] = pdf_path + elif report_type == "summary": + report_types["summary"] = pdf_path + + # Stop checking further if both EPR and summary are found + if report_types["epr"] and report_types["summary"]: + break + + # Extract data based on report availability and priority + if report_types["epr"]: + return extract_epr(report_types["epr"]) + elif report_types["summary"]: + return extract_summary_report(report_types["summary"]) + + # If no relevant PDF is found, return None return None From 5af1836aa7731613ed58437586ca7e592a66150a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 16:32:25 +0000 Subject: [PATCH 35/59] extracting dimensions from epr --- .../stonewater/Wave 3 Preparation.py | 82 ++++++++++++++++++- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7f4f81e9..0b660c76 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -16,6 +16,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ + blah data = { "Address": None, "Postcode": None, @@ -56,8 +57,8 @@ def extract_summary_report(pdf_path): data["Number of Storeys"] = int(storeys_match.group(1)) # Extract Carbon Emissions - carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) - data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) + # carbon_match = 
re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) + # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) @@ -204,6 +205,69 @@ def extract_window_age_description(windows_text): } +def extract_building_parts_epr(text): + """ + Extracts building parts and associated dimensions from the provided PDF file. + Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length. + """ + data = [] + + # Pattern to locate each "Building part" section + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party " + r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)", + re.DOTALL + ) + + # Extract each building part + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + # Clean up building part name to keep only the descriptor (e.g., "Main" or "1st Extension") + cleaned_part_name = re.sub(r" - built in.*", "", part_name) + + floor_data = match.group(2) + + # Pattern to match each floor's measurements + floor_pattern = re.compile( + r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract floor details for each building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + # We now extract out the aggregated data + + main_building = [part for part in data if "Main" in part["Building Part"]] + 
first_extension = [part for part in data if "1st Extension" in part["Building Part"]] + dimensions = { + "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), + "Total Ground Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] + ), + "RIR Floor Area": 0, + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building]), + "First Extension Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension]) if first_extension else 0, + } + + return dimensions + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. @@ -212,6 +276,7 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, "Fuel Bill": None, @@ -232,6 +297,11 @@ def extract_epr(pdf_path): "Existing Secondary Heating % of Heat": None, "Secondary Heating Code": None, "Water Heating Code": None, + 'Total Floor Area (m2)': None, + 'Total Ground Floor Area': None, + 'RIR Floor Area': None, + 'Main Building Wall Area (m2)': None, + 'First Extension Wall Area (m2)': None } with open(pdf_path, "rb") as file: @@ -336,6 +406,9 @@ def extract_epr(pdf_path): window_data = extract_window_age_description(windows_text) data.update(window_data) + building_parts = extract_building_parts_epr(text) + data.update(building_parts) + return data @@ -465,7 +538,7 @@ def main(): if summary_data: summary_data = { "survey_folder": survey_folder, - **summary_data + **summary_data, } extracted_data.append(summary_data) continue @@ -474,11 +547,12 @@ def main(): continue # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) if summary_data: summary_data = { "survey_folder": survey_folder, - **summary_data + 
**summary_data, } extracted_data.append(summary_data) From 4e752fb6c48cb163e4350f32eceb14f5a97d2a94 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:00:02 +0000 Subject: [PATCH 36/59] added summary table dimension extraction --- .../stonewater/Wave 3 Preparation.py | 82 ++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 0b660c76..b660ab64 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -16,7 +16,6 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ - blah data = { "Address": None, "Postcode": None, @@ -40,6 +39,11 @@ def extract_summary_report(pdf_path): "Existing Secondary Heating % of Heat": None, "Secondary Heating Code": None, "Water Heating Code": None, + 'Total Floor Area (m2)': None, + 'Total Ground Floor Area (m2)': None, + 'RIR Floor Area': None, + 'Main Building Wall Area (m2)': None, + 'First Extension Wall Area (m2)': None } with open(pdf_path, "rb") as file: @@ -149,6 +153,9 @@ def extract_summary_report(pdf_path): data["Water Heating Code"] = water_heating_code_match.group(1).strip() + dimensions = extract_building_parts_summary(text) + data.update(dimensions) + return data @@ -256,7 +263,7 @@ def extract_building_parts_epr(text): first_extension = [part for part in data if "1st Extension" in part["Building Part"]] dimensions = { "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area": sum( + "Total Ground Floor Area (m2)": sum( [part["Floor Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] ), "RIR Floor Area": 0, @@ -268,6 +275,75 @@ def extract_building_parts_epr(text): return dimensions +def extract_building_parts_summary(text): + """ + Extracts building parts and associated dimensions from the summary report PDF. 
+ This includes Main Property and multiple extensions if they exist. + """ + data = [] + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to extract each building part, starting from Main Property and including extensions + building_part_pattern = re.compile( + r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory)", + re.DOTALL + ) + + # Loop through each building part match, including Main Property and extensions + for match in building_part_pattern.finditer(dimensions_text): + part_name = match.group(1) + floor_data = match.group(2) + + # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length + floor_pattern = re.compile( + r"(1st Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract data for each floor within the building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data list + data.append({ + "Building Part": part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + # Calculate aggregated dimensions + main_property = [part for part in data if "Main Property" in part["Building Part"]] + first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] + dimensions = { + "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), + "Total Ground Floor 
Area (m2)": sum( + [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] + ), + "RIR Floor Area": 0, + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property]), + "First Extension Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions] + ), + } + + return dimensions + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. @@ -298,7 +374,7 @@ def extract_epr(pdf_path): "Secondary Heating Code": None, "Water Heating Code": None, 'Total Floor Area (m2)': None, - 'Total Ground Floor Area': None, + 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, 'First Extension Wall Area (m2)': None From a30ad1762a37c81c326412c43cfaa5c91f721ad0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:05:37 +0000 Subject: [PATCH 37/59] handled problem case for summary dimensions --- etl/customers/stonewater/Wave 3 Preparation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b660ab64..1973cbd8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -294,7 +294,7 @@ def extract_building_parts_summary(text): # Pattern to extract each building part, starting from Main Property and including extensions building_part_pattern = re.compile( r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" - r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory)", + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", re.DOTALL ) From 98ae672a6160d84e099125904dac390eda1f6fa2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:24:16 +0000 Subject: [PATCH 38/59] debuggin secondary heating code --- etl/customers/stonewater/Wave 3 Preparation.py | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 1973cbd8..84d67f56 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -472,7 +472,8 @@ def extract_epr(pdf_path): if data["Existing Secondary Heating System"] == "": data["Secondary Heating Code"] = "" else: - data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" data["Water Heating Code"] = water_heating_code_match.group(1).strip() # Extract Windows information From d8e8b997a46bf278154cea08444f9b8add3386c5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:31:23 +0000 Subject: [PATCH 39/59] extend to get dimensions from 2nd floor --- etl/customers/stonewater/Wave 3 Preparation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 84d67f56..ad35e2d5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -149,7 +149,8 @@ def extract_summary_report(pdf_path): if data["Existing Secondary Heating System"] == "": data["Secondary Heating Code"] = "" else: - data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" data["Water Heating Code"] = water_heating_code_match.group(1).strip() @@ -236,7 +237,7 @@ def extract_building_parts_epr(text): # Pattern to match each floor's measurements floor_pattern = re.compile( - r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + r"(Lowest floor|First floor|Second 
floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) # Extract floor details for each building part @@ -305,7 +306,7 @@ def extract_building_parts_summary(text): # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length floor_pattern = re.compile( - r"(1st Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) # Extract data for each floor within the building part @@ -634,6 +635,7 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) From c0d896cd59dc3ba003024da9c1caf81737b28d55 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:35:57 +0000 Subject: [PATCH 40/59] Debugging secondary heating extraction --- etl/customers/stonewater/Wave 3 Preparation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ad35e2d5..dc01ef6f 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -460,8 +460,11 @@ def extract_epr(pdf_path): if data["Existing Secondary Heating System"] == "": data["Existing Secondary Heating Controls"] = "" else: - data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", - secondary_text).group(1).strip() + # Might not have heating controls on 2nd system + secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + secondary_controls_match.group(1).strip() if secondary_controls_match else "" + ) data["Existing Secondary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) ) From 
4160ec4dcbae01b438010cc75e0d6eb157d76df2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 17:52:51 +0000 Subject: [PATCH 41/59] debugging missing secondary heating for summary report, completed extraction for files --- .../stonewater/Wave 3 Preparation.py | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index dc01ef6f..7bedef29 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -109,7 +109,10 @@ def extract_summary_report(pdf_path): # Extract heating system # Extract Primary Heating Data # Extract Primary Heating Section - primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 + primary_text = primary_heating_section.group(1) data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( @@ -126,21 +129,29 @@ def extract_summary_report(pdf_path): # Extract Secondary Heating Section secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - secondary_text = secondary_heating_section.group(1) - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text - ) - data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - second_heating_controls_match = re.search(r"Main Heating 
Controls\s*(.*?)\n", secondary_text) - data["Existing Secondary Heating Controls"] = ( - second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" - ) - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) - ) + if secondary_heating_section is None: + data["Existing Secondary Heating System"] = "" + data["Existing Secondary Heating PCDF Reference"] = "" + data["Existing Secondary Heating Controls"] = "" + data["Existing Secondary Heating % of Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) # Extract Secondary Heating and Water Heating Codes secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) @@ -638,6 +649,9 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + extracted_data["Primary Energy Use (kWh/yr)"] = ( + extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] + ) # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) From dbee05e555d758d464efe2a43c18d6c3b017cef8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 18:37:47 +0000 Subject: 
[PATCH 42/59] working on matching lookup --- .../stonewater/Wave 3 Preparation.py | 48 ++++++++++++++++++- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7bedef29..d90360aa 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -5,7 +5,8 @@ import pandas as pd from tqdm import tqdm from collections import Counter -FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" +CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" +FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys") def extract_summary_report(pdf_path): @@ -653,6 +654,51 @@ def main(): extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + # We now merge on the coordinator data so that against each property, we can map the measures + retrofit_packages_board = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in retrofit_packages_board.iterrows(): + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()] + if filtered.empty: + print("Check this once we have full data") + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Osm. ID": home["Osm. 
ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case wher some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if filtered.empty: + raise Exception("somethign went wrong") + if filtered.shape[0] != 1: + raise Exception("somethign went wrong2") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Osm. ID": home["Osm. ID"], + "Name": home["Name"] + } + ) + + matching_lookup = pd.DataFrame(matching_lookup) + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 70bec3cc..97314b32 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1,3 +1,4 @@ PyPDF2 pandas tqdm +openpyxl From 791262fa866e420cef6a2eced9b4f4ec28897409 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 09:29:11 +0000 Subject: [PATCH 43/59] adding all surveys and updating creation of filepaths --- .../stonewater/Wave 3 Preparation.py | 124 +++++++++++++++++- 1 file changed, 117 insertions(+), 7 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index d90360aa..fe1faa9d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2,11 +2,13 @@ import os import PyPDF2 import re import pandas as pd +import numpy as np from tqdm import tqdm from collections import Counter CUSTOMER_FOLDER_PATH = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" -FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys") +SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") +NUM_FOLDERS = 14 def extract_summary_report(pdf_path): @@ -610,11 +612,18 @@ def main(): This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater. """ # List only directories in the specified FILE_PATH - survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))] + survey_folders = [] + + # Loop over each survey folder and list its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list extracted_data = [] for survey_folder in tqdm(survey_folders): - survey_folder_path = os.path.join(FILE_PATH, survey_folder) + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) # List the folders inside of the survey folder survey_subfolders = [name for name in os.listdir(survey_folder_path) @@ -623,9 +632,17 @@ def main(): # Check if there's a "retrofit assessment" folder retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + # If retrofit assessment folder exists, check if it has content - if retrofit_folder: - retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, 
ra_folder) if os.listdir(retrofit_folder_path): # If not empty summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: @@ -642,6 +659,11 @@ def main(): # If no retrofit folder or it was empty, check files in survey_folder summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if summary_data: summary_data = { "survey_folder": survey_folder, @@ -650,9 +672,14 @@ def main(): extracted_data.append(summary_data) extracted_data = pd.DataFrame(extracted_data) + + # What was missed??? + extracted_data["Primary Energy Use (kWh/yr)"] = ( extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + # TODO: Clean up SAP and extract EPC + # TODO: RIR floor area!!! # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( @@ -663,7 +690,13 @@ def main(): # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in retrofit_packages_board.iterrows(): - filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()] + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", ""), case=False + )] + if filtered.empty: print("Check this once we have full data") continue @@ -684,8 +717,12 @@ def main(): if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + if 
home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + if filtered.empty: - raise Exception("somethign went wrong") + print("Check this once we have full data2!!!") + continue if filtered.shape[0] != 1: raise Exception("somethign went wrong2") @@ -699,6 +736,79 @@ def main(): matching_lookup = pd.DataFrame(matching_lookup) + if matching_lookup["Osm. ID"].duplicated().sum(): + raise Exception("Duplicate Osm. IDs") + + if matching_lookup["survey_folder"].duplicated().sum(): + raise Exception("Duplicate survey folders") + + measure_columns = [ + 'Main Wall Insulation', + 'Secondary Wall Insulation', + 'Loft insulation', + 'Flat Roof', + 'Room in Roof', + 'Window Upgrade', + 'Door Upgrade', + 'Ventilation', + 'Main Heating', + 'Water Heating', + 'Heating Controls', + 'Solar PV', + 'Other measures' + ] + + # We should end up with a 1:1 mapping between the Osm. ID and the survey folder + stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge( + retrofit_packages_board[ + [ + "Name", + "Osm. ID", + "Address ID", + "Archetype ID", + "Arch. Group Rank", "Archetype Representative", + "Actual SAP Band", + "Actual SAP Rating", + "Modelled SAP Band", + "Modelled SAP Rating", + ] + measure_columns + ], + on=["Osm. ID", "Name"], + how="left" + ) + + # We've appended the recommended packages and modelled SAP ratings to the data + # We also want to append the windows data + windows_data = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx" + ), + header=12 + ) + + # We get a lookup id of Osm.ID and when the windows were fitted + windows_data = windows_data[ + ["Osm. 
ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"] + ] + # Convert to string for the moment + windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[ + "Parent Asset Window attributes - Fitted/renewed date" + ].astype(str) + # Create a single date column + windows_data["Fitted/renewed date"] = np.where( + pd.notnull(windows_data["Window attributes - Fitted/renewed date"]), + windows_data["Window attributes - Fitted/renewed date"], + windows_data["Parent Asset Window attributes - Fitted/renewed date"] + ) + # Convert to a date + windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"]) + # Calculate the number of years since something was done on the windows + windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[ + "Fitted/renewed date"]).dt.days / 365 + + # TODO: Flag if a package includes windows + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) From 8983ebec2fd9ea593f19990f5c02847da4adbc45 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 10:03:10 +0000 Subject: [PATCH 44/59] adding epc band --- .../stonewater/Wave 3 Preparation.py | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fe1faa9d..2654fae5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -11,6 +11,32 @@ SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") NUM_FOLDERS = 14 +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. 
+ :param sap_points: numerical value of SAP points, typically between 0 and 100 + :return: + """ + + if sap_points <= 0: + raise ValueError("SAP points should be above 0.") + + if sap_points >= 92: + return "A" + elif sap_points >= 81: + return "B" + elif sap_points >= 69: + return "C" + elif sap_points >= 55: + return "D" + elif sap_points >= 39: + return "E" + elif sap_points >= 21: + return "F" + else: + return "G" + + def extract_summary_report(pdf_path): """ Extracts specific data from the provided PDF file. @@ -23,6 +49,7 @@ def extract_summary_report(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Current EPC Band": None, "Fuel Bill": None, "Number of Storeys": None, "Window Age Description": None, @@ -57,7 +84,7 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) - data["Current SAP Rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -367,6 +394,7 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Current EPC Band": None, "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, @@ -621,6 +649,9 @@ def main(): folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] survey_folders.extend(folder_contents) # Append contents to the master list + # Get rid of .DS_Store files + survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")] + extracted_data = [] for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) @@ -643,6 +674,16 @@ def main(): retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) else: retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # 
Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + if os.listdir(retrofit_folder_path): # If not empty summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: @@ -673,14 +714,24 @@ def main(): extracted_data = pd.DataFrame(extracted_data) - # What was missed??? - extracted_data["Primary Energy Use (kWh/yr)"] = ( extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) + extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) + # TODO: Clean up SAP and extract EPC # TODO: RIR floor area!!! 
+ # Remove some definite duplicates + extracted_data = extracted_data[ + ~extracted_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + ] + ) + ] + # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), @@ -715,9 +766,11 @@ def main(): filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] # We have an edge case wher some properties have two outputs in Sharepoint if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + bl1h2 filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + blah1 filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] if filtered.empty: From cb9399a704bcf2605429bc18704c0ff2b413d406 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 10:22:23 +0000 Subject: [PATCH 45/59] investigating missings' --- .../stonewater/Wave 3 Preparation.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 2654fae5..53279eed 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -720,15 +720,22 @@ def main(): extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) - # TODO: Clean up SAP and extract EPC # TODO: RIR floor area!!! 
# Remove some definite duplicates + dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"] + dupes = extracted_data[extracted_data["Address"].isin(dupes)] + dupes = dupes.sort_values("Address") + # Get all of the folders that end with ROSS + to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + extracted_data = extracted_data[ ~extracted_data["survey_folder"].isin( [ "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", - ] + "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", + "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" + ] + to_drop ) ] @@ -740,8 +747,15 @@ def main(): retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] # We now match this retrofit packages board to the extracted data matching_lookup = [] - for _, home in retrofit_packages_board.iterrows(): - filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] == "Flat 21 Walmer Street": + filtered = extracted_data[ + extracted_data["survey_folder"] == "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD" + ].copy() + else: + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( @@ -749,7 +763,6 @@ def main(): )] if filtered.empty: - print("Check this once we have full data") continue if filtered.shape[0] == 1: @@ -766,18 +779,20 @@ def main(): filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] # We have an edge case wher some properties have two outputs in Sharepoint if home["Name"] == 
"197 Granby Court" and home["Postcode"] == "MK1 1NQ": - bl1h2 - filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + raise Exception("Fix me1") + # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': - blah1 - filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + raise Exception("Fix me2") + # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + + if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] if filtered.empty: - print("Check this once we have full data2!!!") continue if filtered.shape[0] != 1: - raise Exception("somethign went wrong2") + raise Exception("something went wrong") matching_lookup.append( { @@ -788,6 +803,9 @@ def main(): ) matching_lookup = pd.DataFrame(matching_lookup) + # Find Osmosis IDs that are in the packages board but not in the matching looking + # missing_osm_ids = set(retrofit_packages_board["Osm. ID"]) - set(matching_lookup["Osm. ID"]) + # missing_osm_ids = list(missing_osm_ids) if matching_lookup["Osm. ID"].duplicated().sum(): raise Exception("Duplicate Osm. 
IDs") From 51c2d04a6d0d919a07edac2d34e868a59c755b2d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 11:42:40 +0000 Subject: [PATCH 46/59] fixing missed matches --- .../stonewater/Wave 3 Preparation.py | 80 ++++++++++++++----- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 53279eed..5e444ca8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -741,26 +741,53 @@ def main(): # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( - os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater 3.0 Updated SAP Pre & Modelled 29.10.24.xlsx"), header=4 ) retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # Take just the rows that have been surveyed + retrofit_packages_board = retrofit_packages_board[ + retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) + ] + + # Replace \n with "" + extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "") + + manual_filters = { + "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", + "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", + "1 Cluny Way": "12-1-1 Cluny Way-SG15 6ZB", + "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", + 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", + '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', + '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', + 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican 
Lane-RG14 1NN', + 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB', + } + # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): - # Handle the case that has the wrong postcode in the asset data - if home["Name"] == "Flat 21 Walmer Street": - filtered = extracted_data[ - extracted_data["survey_folder"] == "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD" - ].copy() + if home["Address ID"] == 6111566: + blah + # 6118117, 6118744, 6117091 + if home["Name"] in manual_filters: + filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() else: filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() + filtered["survey_folder"].values - # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces - filtered = filtered[filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( - home["Name"].replace(r"[^\w\s]", ""), case=False - )] + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", + "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + filtered = filtered[to_filter] if filtered.empty: continue @@ -769,7 +796,7 @@ def main(): matching_lookup.append( { "survey_folder": filtered["survey_folder"].values[0], - "Osm. ID": home["Osm. 
ID"], + "Address ID": home["Address ID"], "Name": home["Name"] } ) @@ -797,15 +824,23 @@ def main(): matching_lookup.append( { "survey_folder": filtered["survey_folder"].values[0], - "Osm. ID": home["Osm. ID"], + "Address ID": home["Address ID"], "Name": home["Name"] } ) matching_lookup = pd.DataFrame(matching_lookup) # Find Osmosis IDs that are in the packages board but not in the matching looking - # missing_osm_ids = set(retrofit_packages_board["Osm. ID"]) - set(matching_lookup["Osm. ID"]) - # missing_osm_ids = list(missing_osm_ids) + missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"]) + missing_ids = list(missing_ids) + print(len(missing_ids)) + if missing_ids: + # We check that the missing ids have no data yet + missing_data = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)][ + ["Name", "Address ID", "Archetype ID"]] + extracted_data[extracted_data["survey_folder"].str.contains("23 Monmouth")]["survey_folder"].values + + matching_lookup[matching_lookup["survey_folder"].str.contains("23 Monmouth")] if matching_lookup["Osm. ID"].duplicated().sum(): raise Exception("Duplicate Osm. IDs") @@ -834,7 +869,6 @@ def main(): retrofit_packages_board[ [ "Name", - "Osm. ID", "Address ID", "Archetype ID", "Arch. 
Group Rank", "Archetype Representative", @@ -848,6 +882,14 @@ def main(): how="left" ) + # Create a section for costs + for measure in measure_columns: + stonewater_data[f"Cost of {measure}"] = None + + stonewater_data["Total Cost of Measures"] = None + stonewater_data["Contingency Cost"] = None + stonewater_data["Total Cost of Measures inc Contingency"] = None + # We've appended the recommended packages and modelled SAP ratings to the data # We also want to append the windows data windows_data = pd.read_excel( @@ -878,12 +920,8 @@ def main(): windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[ "Fitted/renewed date"]).dt.days / 365 - # TODO: Flag if a package includes windows - - # Save this as a csv - # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) - - missed = [f for f in survey_folders if f not in extracted_data["survey_folder"].tolist()] + stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) + stonewater_data = stonewater_data.merge(windows_data, on="Osm. 
ID", how="left") # if __name__ == "__main__": # main() From 90c9466421b5cb187c9355d0a8c005f379650ece Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 13:46:43 +0000 Subject: [PATCH 47/59] sorted dupes --- .../stonewater/Wave 3 Preparation.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 5e444ca8..67362865 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -756,27 +756,34 @@ def main(): manual_filters = { "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", - "1 Cluny Way": "12-1-1 Cluny Way-SG15 6ZB", "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. 
Leonards Avenue-MK42 0RB', + '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', + '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', + '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', + # '2 Sorrell Place': '', + # '72 St Ives Road': '', + # '1 The Close, Burton Gardens': '', + # '102 Cheaton Close': '', + # 'Flat 16 Spring Gardens': '', + # '4 Apple Close': '', + '25 Folly Lane': '', + } # We now match this retrofit packages board to the extracted data matching_lookup = [] for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): # Handle the case that has the wrong postcode in the asset data - if home["Address ID"] == 6111566: - blah - # 6118117, 6118744, 6117091 if home["Name"] in manual_filters: filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy() else: filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy() - filtered["survey_folder"].values # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( @@ -836,14 +843,11 @@ def main(): print(len(missing_ids)) if missing_ids: # We check that the missing ids have no data yet - missing_data = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)][ - ["Name", "Address ID", "Archetype ID"]] - extracted_data[extracted_data["survey_folder"].str.contains("23 Monmouth")]["survey_folder"].values + if len(missing_ids) != 8: + raise Exception("Unacceptable number of missings") - matching_lookup[matching_lookup["survey_folder"].str.contains("23 Monmouth")] - - if matching_lookup["Osm. ID"].duplicated().sum(): - raise Exception("Duplicate Osm. 
IDs") + if matching_lookup["Address ID"].duplicated().sum(): + raise Exception("Duplicate Address IDs") if matching_lookup["survey_folder"].duplicated().sum(): raise Exception("Duplicate survey folders") @@ -865,20 +869,21 @@ def main(): ] # We should end up with a 1:1 mapping between the Osm. ID and the survey folder - stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="left").merge( + stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="inner").merge( retrofit_packages_board[ [ "Name", + "RA", "Address ID", "Archetype ID", - "Arch. Group Rank", "Archetype Representative", + "Arch. Group Rank", "Actual SAP Band", "Actual SAP Rating", "Modelled SAP Band", "Modelled SAP Rating", ] + measure_columns ], - on=["Osm. ID", "Name"], + on=["Address ID", "Name"], how="left" ) @@ -900,9 +905,13 @@ def main(): header=12 ) + windows_data = windows_data[windows_data["Address ID"] != "Address ID"] + windows_data = windows_data[~pd.isnull(windows_data["Address ID"])] + # We get a lookup id of Osm.ID and when the windows were fitted windows_data = windows_data[ - ["Osm. ID", "Window attributes - Fitted/renewed date", "Parent Asset Window attributes - Fitted/renewed date"] + ["Address ID", "Window attributes - Fitted/renewed date", + "Parent Asset Window attributes - Fitted/renewed date"] ] # Convert to string for the moment windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[ @@ -921,7 +930,8 @@ def main(): "Fitted/renewed date"]).dt.days / 365 stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"]) - stonewater_data = stonewater_data.merge(windows_data, on="Osm. 
ID", how="left") + windows_data["Address ID"] = windows_data["Address ID"].astype(float) + stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") # if __name__ == "__main__": # main() From fba5b2b3cbe786dd7d16b1380fe59f9ff6447206 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 13:58:36 +0000 Subject: [PATCH 48/59] added RIR detection to summary report --- .../stonewater/Wave 3 Preparation.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 67362865..6cf26df8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -320,7 +320,7 @@ def extract_building_parts_epr(text): def extract_building_parts_summary(text): """ Extracts building parts and associated dimensions from the summary report PDF. - This includes Main Property and multiple extensions if they exist. + This includes Main Property, multiple extensions if they exist, and Room in Roof areas. 
""" data = [] @@ -368,6 +368,20 @@ def extract_building_parts_summary(text): "Party Wall Length (m)": party_wall_length }) + # Check specifically for "Room(s) in Roof" entries, which only have Floor Area + room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") + room_in_roof_match = room_in_roof_pattern.search(floor_data) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + data.append({ + "Building Part": part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + }) + # Calculate aggregated dimensions main_property = [part for part in data if "Main Property" in part["Building Part"]] first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] @@ -376,10 +390,14 @@ def extract_building_parts_summary(text): "Total Ground Floor Area (m2)": sum( [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] ), - "RIR Floor Area": 0, - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property]), + "RIR Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] + ), + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if + x["Perimeter (m)"] and x["Room Height (m)"]]), "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions] + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if + x["Perimeter (m)"] and x["Room Height (m)"]] ), } @@ -887,6 +905,9 @@ def main(): how="left" ) + if stonewater_data["Address ID"].duplicated().sum(): + raise Exception("Duplicate Address IDs") + # Create a section for costs for measure in measure_columns: stonewater_data[f"Cost of {measure}"] = None @@ 
-933,5 +954,10 @@ def main(): windows_data["Address ID"] = windows_data["Address ID"].astype(float) stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left") + if stonewater_data["Address ID"].duplicated().sum(): + raise Exception("Duplicate Address IDs") + + # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values + # if __name__ == "__main__": # main() From d0cf88af6498d73a1155af320e5d6b899e3f94fa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 14:09:42 +0000 Subject: [PATCH 49/59] added RIR area search for epr --- .../stonewater/Wave 3 Preparation.py | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 6cf26df8..ee5cd1ca 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -256,8 +256,9 @@ def extract_window_age_description(windows_text): def extract_building_parts_epr(text): """ - Extracts building parts and associated dimensions from the provided PDF file. + Extracts building parts and associated dimensions from the provided PDF text. Each building part (main and extensions) includes floor area, room height, perimeter, and party wall length. + Handles cases where 'Room(s) in Roof area' appears within the part_name with only the Floor Area information. 
""" data = [] @@ -271,12 +272,28 @@ def extract_building_parts_epr(text): # Extract each building part for match in building_part_pattern.finditer(text): part_name = match.group(1).strip() - # Clean up building part name to keep only the descriptor (e.g., "Main" or "1st Extension") - cleaned_part_name = re.sub(r" - built in.*", "", part_name) - floor_data = match.group(2) - # Pattern to match each floor's measurements + # Check for "Room(s) in Roof area" within the part_name + room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name) + if room_in_roof_match: + # Extract Room in Roof area and add it as a separate entry + floor_area = float(room_in_roof_match.group(1)) + # Clean up part name to exclude "Room(s) in Roof area" from the building part name + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + }) + else: + # Clean up part name to keep only the descriptor (e.g., "Main" or "1st Extension") + cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip() + + # Pattern to match each floor's measurements in standard cases floor_pattern = re.compile( r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) @@ -299,8 +316,7 @@ def extract_building_parts_epr(text): "Party Wall Length (m)": party_wall_length }) - # We now extract out the aggregated data - + # Aggregated data calculation main_building = [part for part in data if "Main" in part["Building Part"]] first_extension = [part for part in data if "1st Extension" in part["Building Part"]] dimensions = { @@ -308,10 +324,17 @@ def extract_building_parts_epr(text): "Total Ground Floor Area (m2)": sum( [part["Floor 
Area (m2)"] for part in data if "Lowest floor" in part["Floor Level"]] ), - "RIR Floor Area": 0, - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building]), + "RIR Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] + ), + "Main Building Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in main_building if + x["Perimeter (m)"] and x["Room Height (m)"]] + ), "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension]) if first_extension else 0, + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extension if + x["Perimeter (m)"] and x["Room Height (m)"]] + ) if first_extension else 0, } return dimensions From f97bb7f1273b349abd77f75ff09152af87506f4e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 14:14:40 +0000 Subject: [PATCH 50/59] extract lighting fittings from epr --- etl/customers/stonewater/Wave 3 Preparation.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ee5cd1ca..16970803 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -461,7 +461,10 @@ def extract_epr(pdf_path): 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, - 'First Extension Wall Area (m2)': None + 'First Extension Wall Area (m2)': None, + "Number of Light Fittings": None, + "Number of LEL Fittings": None, + "Number of fittings needing LEL": None } with open(pdf_path, "rb") as file: @@ -573,6 +576,13 @@ def extract_epr(pdf_path): building_parts = extract_building_parts_epr(text) data.update(building_parts) + # Get number of lighting outlets and number of fittings needing LEL + lighting_fittings_match = re.search(r"Total number of light 
fittings\s*(\d+)", text) + data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) + lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) + data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + return data From bccf3c621bbec73ac35a18f123ba73b456c695df Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 14:17:20 +0000 Subject: [PATCH 51/59] lighting fitting extraction from summary report --- etl/customers/stonewater/Wave 3 Preparation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 16970803..ccd062e2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -73,7 +73,10 @@ def extract_summary_report(pdf_path): 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, - 'First Extension Wall Area (m2)': None + 'First Extension Wall Area (m2)': None, + "Number of Light Fittings": None, + "Number of LEL Fittings": None, + "Number of fittings needing LEL": None } with open(pdf_path, "rb") as file: @@ -198,6 +201,10 @@ def extract_summary_report(pdf_path): dimensions = extract_building_parts_summary(text) data.update(dimensions) + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + return data @@ -771,8 +778,6 @@ def main(): extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) - # TODO: RIR floor area!!! - # Remove some definite duplicates dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"] dupes = extracted_data[extracted_data["Address"].isin(dupes)] From 7e26fb4b86eee0c5f0ab3bd4e562796d44c5d0a7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 20:30:05 +0000 Subject: [PATCH 52/59] working on proposed sample for stonewater --- .../stonewater/Wave 3 Preparation.py | 203 +++++++++++++++++- 1 file changed, 201 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ccd062e2..bfdc8beb 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -486,7 +486,7 @@ def extract_epr(pdf_path): data["Postcode"] = data["Address"].split(",")[-1].strip() # Extract Current and Potential SAP ratings - sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) data["Current SAP Rating"] = current_sap @@ -896,7 +896,6 @@ def main(): # Find Osmosis IDs that are in the packages board but not in the matching looking missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"]) missing_ids = list(missing_ids) - print(len(missing_ids)) if missing_ids: # We check that the missing ids have no data yet if len(missing_ids) != 8: @@ -937,6 +936,7 @@ def main(): "Actual SAP Rating", "Modelled SAP Band", "Modelled SAP Rating", + "Package Ref", ] + 
measure_columns ], on=["Address ID", "Name"], @@ -995,7 +995,206 @@ def main(): if stonewater_data["Address ID"].duplicated().sum(): raise Exception("Duplicate Address IDs") + # Save this data to excel + stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages.xlsx", index=False) + + cost_sheet = [ + { + "measure": "EWI 0.30 w.m2.K", "cost": 298.35, "unit": "m2" + }, + { + "measure": "CWI RdSAP Default", "cost": 14.21, "unit": "m2" + }, + { + "measure": "Poss Extract CWI & Refill (issues identified)", "cost": 14.21 + 25, "unit": "m2" + }, + { + "measure": "IWI 0.30 w.m2.K", "cost": 244.80, "unit": "m2" + }, + { + "measure": "EWI/IWI 0.3", "cost": (298.35 + 244.8) / 2, "unit": "m2" + }, + { + "measure": "Loft Insulation 0.11 w.m2.K", "cost": 16.07, "unit": "m2" + }, + { + "measure": "Flat Roof 0.11 w.m2.K", "cost": 195, "unit": "m2" + }, + { + "measure": "DG Window 1.30 w.m2.K", "cost": 1140, "unit": "each" + }, + { + "measure": "Secondary 2.40", "cost": 974, "unit": "each" + }, + { + "measure": "Ins. Door 1.30 w.m2.K", "cost": None, "unit": "each" + }, + { + "measure": "Ins. 
Door 1.40 w.m2.K", "cost": None, "unit": "each" + }, + { + "measure": "DMEV", "cost": 900, "unit": "each" + }, + { + "measure": "ASHP Vaillant 102607 5kw", "cost": None, "unit": "each" + }, + { + "measure": "HHRSH Quantum 150", "cost": None, "unit": "each" + }, + { + "measure": "Dual Stat Tank 210lt 50mm Foam", "cost": None, "unit": "each" + }, + { + "measure": "Dual Stat Tank 160lt 50mm Foam", "cost": None, "unit": "each" + }, + { + "measure": "Dual Stat Tank 110lt 50mm Foam", "cost": None, "unit": "each" + }, + { + "measure": "Smart Thermostat", "cost": 1200, "unit": "each" + }, + { + "measure": "TRV's", "cost": 350, "unit": "each" + }, + { + "measure": "Solar PV - 3.0kwp", "cost": 4365.0, "unit": "each" + }, + { + "measure": "Solar PV - 1.5kwp", "cost": 3881, "unit": "each" + }, + { + "measure": "LEL", "cost": 35, "unit": "per bulb" + }, + { + "measure": "Roof 0.16 - Walls 0.30", "cost": 180, "unit": "floor area m2" + }, + { + "measure": "Roof 0.16 - Walls 0.16", "cost": 180, "unit": "floor area m2" + }, + ] + cost_sheet = pd.DataFrame(cost_sheet) + + # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater + cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False) + + stonewater_data["Room in Roof"].value_counts() + # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values + create_proposed_wave_3_bid( + costed_packages_filepath=os.path.join( + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx" + ), + archetypes_sheet_filepath=os.path.join( + CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" + ) + ) + + +def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): + # We read in the costed packages + costed_packages = pd.read_excel(costed_packages_filepath) + + archetypes_to_cost = costed_packages[ + [ + "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", 
"Modelled SAP Band", + "Modelled SAP Rating", 'Total Cost of Measures', 'Contingency Cost', + 'Total Cost of Measures inc Contingency' + ] + ].copy() + + # We take properties that are EPC D and below (61% of units) + archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] + + archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"]) + + average_cost = archetypes_to_cost[ + archetypes_to_cost["Has been modelled"] + ]['Total Cost of Measures inc Contingency'].mean() + print(average_cost) + + # These are the Arhetypes that will likely be suitable for Wave 3 + archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4) + archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])] + archetypes_sheet = archetypes_sheet[archetypes_sheet["Address ID"] != "Address ID"] + archetypes_sheet["Address ID"] = archetypes_sheet["Address ID"].astype(int) + + # We merge the property details onto the costed archetypes + archetypes_to_cost = archetypes_to_cost.merge( + archetypes_sheet[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + proposed_sample = archetypes_sheet[archetypes_sheet["Archetype ID"].isin(archetypes_to_cost["Archetype ID"])] + + proposed_sample = proposed_sample[ + [ + "Name", "Postcode", "UPRN", "UDPRN", "Address ID", "Osm. 
ID", "Archetype ID", + "Property Type", "Wall Type", "Roof Type", "Heating" + ] + ] + + # We classify into high and low confidence + + match_classification = [] + for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): + surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]] + # We now check if we have a perfect match + surveyed = surveyed[ + (surveyed["Property Type"] == home["Property Type"]) & + (surveyed["Wall Type"] == home["Wall Type"]) & + (surveyed["Roof Type"] == home["Roof Type"]) & + (surveyed["Heating"] == home["Heating"]) + ] + + if surveyed.empty: + match_classification.append( + { + "Address ID": home["Address ID"], + "Match to Surveyed": "Approximate" + } + ) + continue + match_classification.append( + { + "Address ID": home["Address ID"], + "Match to Surveyed": "Exact" + } + ) + + match_classification = pd.DataFrame(match_classification) + + proposed_sample = proposed_sample.merge( + match_classification, + on="Address ID", + how="left", + ) + + # Merge on the cost per archetype + cost_per_archetype = ( + archetypes_to_cost.groupby("Archetype ID")[['Total Cost of Measures inc Contingency']].mean().reset_index() + ) + proposed_sample = proposed_sample.merge( + cost_per_archetype, + on="Archetype ID", + how="left" + ) + + # We add on a boolean to indicate if a property from that archetype has been modelled + proposed_sample = proposed_sample.merge( + archetypes_to_cost.groupby("Archetype ID")[["Has been modelled"]].any().reset_index(), + on="Archetype ID", + how="left" + ) + + proposed_sample["Total Cost of Measures inc Contingency"] = np.where( + ~proposed_sample["Has been modelled"], + None, proposed_sample["Total Cost of Measures inc Contingency"] + ) + + # Save excel + proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid (WIP).xlsx", index=False) + # if __name__ == "__main__": # main() From a9ea89d2ae5253453e227c83c067f8a248d3f893 Mon Sep 17 00:00:00 2001 From: 
Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 12:03:17 +0000 Subject: [PATCH 53/59] done with stonewater for now --- .../stonewater/Wave 3 Preparation.py | 144 ++++++++++++++++-- 1 file changed, 133 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bfdc8beb..477a73c8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -76,10 +76,13 @@ def extract_summary_report(pdf_path): 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, - "Number of fittings needing LEL": None + "Number of fittings needing LEL": None, + "Main Roof Type": None, + "Main Roof Insulation": None, + "Main Roof Insulation Thickness": None, } - with open(pdf_path, "rb") as file: + with (open(pdf_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" for page in reader.pages: @@ -205,6 +208,27 @@ def extract_summary_report(pdf_path): data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL) + roof_text = roof_section.group(1).strip() + roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text) + data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None + + # Check if "Insulation" exists between Type and Insulation Thickness + insulation_search = re.search( + r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL + ) + + if insulation_search: + # Insulation match will be present if it exists, otherwise it will be None + insulation_match = insulation_search.group(2) # Optional group for Insulation + insulation_thickness_match = insulation_search.group(4) # Required group for Insulation Thickness + + # Populate insulation fields + data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None + data["Main Roof Insulation Thickness"] = ( + insulation_thickness_match.strip() if insulation_thickness_match else None + ) + return data @@ -434,6 +458,49 @@ def extract_building_parts_summary(text): return dimensions +import re + + +def extract_roof_details_epr(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the provided EPR PDF text. 
+ """ + # Define data structure to hold results + roof_data = [] + + # Locate each building part section + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + + # Extract each building part's data, including roof details + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + + # Clean up the building part name + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + + part_details = match.group(2) + + # Extract Roof Type, Roof Insulation, and Roof Insulation Thickness + roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) + + # Store results for this building part + roof_data.append({ + "Building Part": cleaned_part_name, + "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, + "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, + "Roof Insulation Thickness": roof_insulation_thickness_match.group( + 1).strip() if roof_insulation_thickness_match else None, + }) + + return roof_data + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. 
@@ -471,7 +538,10 @@ def extract_epr(pdf_path): 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, - "Number of fittings needing LEL": None + "Number of fittings needing LEL": None, + "Main Roof Type": None, + "Main Roof Insulation": None, + "Main Roof Insulation Thickness": None, } with open(pdf_path, "rb") as file: @@ -590,6 +660,13 @@ def extract_epr(pdf_path): data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + roof_details = extract_roof_details_epr(text) + # Get from the main building + main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]] + data["Main Roof Type"] = main_roof_details[0]["Roof Type"] + data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"] + return data @@ -1077,13 +1154,11 @@ def main(): # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False) - stonewater_data["Room in Roof"].value_counts() - # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1098,11 +1173,30 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa archetypes_to_cost = costed_packages[ [ "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band", - "Modelled SAP Rating", 'Total Cost of 
Measures', 'Contingency Cost', - 'Total Cost of Measures inc Contingency' + "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost', + 'Total Cost of Measures inc Contingency', 'Main Roof Type', 'Main Roof Insulation', + 'Main Roof Insulation Thickness', 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference' ] ].copy() + # Combine 'Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', separating by colons! + archetypes_to_cost['Surveyed Main Roof'] = ( + archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' + + archetypes_to_cost['Main Roof Insulation Thickness'].astype(str) + ) + + # Combine the heating systems, separating by colons! + archetypes_to_cost['Surveyed Main Heating'] = ( + archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[ + 'Existing Primary Heating PCDF Reference'].astype(str) + ) + + archetypes_to_cost = archetypes_to_cost.drop( + columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference']) + # We take properties that are EPC D and below (61% of units) archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] @@ -1139,7 +1233,19 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa match_classification = [] for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): - surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]] + + surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy() + surveyed["Package Ref"] = surveyed["Package Ref"].astype(str) + + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + + surveyed_roofs = " or ".join(sorted([x for x in 
surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + # We now check if we have a perfect match surveyed = surveyed[ (surveyed["Property Type"] == home["Property Type"]) & @@ -1149,17 +1255,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa ] if surveyed.empty: + if package == "2B2A": + raise Exception("Fix me") match_classification.append( { "Address ID": home["Address ID"], - "Match to Surveyed": "Approximate" + "Match to Surveyed": "Approximate", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating } ) continue + # Re-do + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + match_classification.append( { "Address ID": home["Address ID"], - "Match to Surveyed": "Exact" + "Match to Surveyed": "Exact", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating } ) From 6cf0db87f7a3fc68db02d518f9e57bc28b3fe0c1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 14:35:14 +0000 Subject: [PATCH 54/59] completed packages for first 12 surveys --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/customers/aiha/xml_extraction.py | 139 ++++++++++++++++----------- 3 files changed, 85 insertions(+), 58 deletions(-) diff --git a/.idea/Model.iml 
b/.idea/Model.iml index 762580d9..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 038e8593..65e0eb1e 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -9,6 +9,32 @@ SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIH CONTINGENCY_RATE = 0.26 +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. + :param sap_points: numerical value of SAP points, typically between 0 and 100 + :return: + """ + + if sap_points <= 0: + raise ValueError("SAP points should be above 0.") + + if sap_points >= 92: + return "A" + elif sap_points >= 81: + return "B" + elif sap_points >= 69: + return "C" + elif sap_points >= 55: + return "D" + elif sap_points >= 39: + return "E" + elif sap_points >= 21: + return "F" + else: + return "G" + + def main(): """ This script handles the extraction of data from the XML files in the survey folders. @@ -76,24 +102,14 @@ def main(): # TODO # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft # [Can't remember, not clear - Chenai will check] - # - AIH001-03 instead of cylinder insulation, we could install an air source heat pump but it might not be the - # best option for this property due to it being extrememly large and the walls being uninsulated. It might not - # be performant enough in the winter, when COP will be more like 1.5. - # - AIH001-03 - can add additional 1.6kWp solar PV to flat roof to get close to EPC C. How many occupants are - # in the property? Does it make sense to have such a large solar PV system (5.6kWp)? # - AIH001-04 why couldn't the cylinder be accessed? 
- treating this could get to the EPC C # - Potential measure - search for the cylinder and insulate it # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same # buulding [Question for Lewis & Kevin] # - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from # the other unit] - # - AIH001-09 - Why is there assumed secondary heating? [Question for Lewis & Kevin] # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? - # [Question for Lewis & Kevin] - # - AIH001-11 - The layout of this unit is confusing, is there roof access? [NO!!!! - It's a Sun room!!] - # - AIH001-12 - Why was there not access to the cylinder? [Sealed shut] - # - AIH001-12 - Is the need to draught proofing due to the windows? [This would be addressed by deailing with the - # windows] + # [Question for Lewis & Kevin] - [YES - ASHP!!!!] recommended_measures = [ { @@ -114,40 +130,32 @@ def main(): }, { "measure": "Solar PV", - "description": "5.6kWp Solar PV system", + "description": "4kWp Solar PV system", "config": [ { "size": "4kWp", "orientation": "East", "elavation": 30, - "overshading": "Modest", + "overshading": "None or little", }, - { - "size": "1.6kWp", - "orientation": "Horizontal", - "elavation": "Horizontal", - "overshading": "Modest", - } ], - "sap_points": 7, - "ending_sap": 53 + "sap_points": 10, + "ending_sap": 54 }, { - "measure": "Loft Insulation", - "description": "300mm loft insulation", - "floor_area": 80, # Based on area of 1st floor - "sap_points": 8, - "ending_sap": 61 + "measure": "Air Source Heat Pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", + "sap_points": 20, + "ending_sap": 74 }, { - "measure": "TTZC", - "description": "Smart Thermostat", - "sap_points": 3, - "ending_sap": 64 + "measure": "Tariff Review", + "description": "Switch to 24-hour tariff", + "sap_points": 
15, + "ending_sap": 89 } ], - "notes": "There was no access to the loft for this property and so a loft hatch would need to be " - "installed..." + "notes": "Unclear if the loft is accessible" }, { "survey_key": "AIH001-04", @@ -174,14 +182,14 @@ def main(): "size": "4kWp", "orientation": "South", "elavation": 30, - "overshading": "Modest", + "overshading": "None or little", } ], - "sap_points": 12, - "ending_sap": 67 + "sap_points": 15, + "ending_sap": 70 } ], - "notes": "" + "notes": "Roof is flat, PV array should be installed south facing with elevation" }, { "survey_key": "AIH001-05", @@ -276,7 +284,7 @@ def main(): "measure": "Internal Wall Insulation", "description": "100mm internal wall insulation", "hlp": 24.13 * 2.63, - "sap_points": 5, + "sap_points": 7, "ending_sap": 69, }, { @@ -316,8 +324,32 @@ def main(): "description": "Smart Thermostat", "sap_points": 3, "ending_sap": 56, + }, + { + "measure": "Solar PV", + "description": "1.6kWp Solar PV system", + "config": [ + { + "size": "1.6W", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 6, + "ending_sap": 62 + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension + "sap_points": 8, + "ending_sap": 70, } - ] + ], + "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array" + "per unit). Area on south-east part of roof is ~22m2 with no overshadowing. Flat roof area is 8m2" + "with modest overshadowing. 
We suggest a 3.2kWp system, across two units" }, { "survey_key": "AIH001-11", @@ -353,14 +385,7 @@ def main(): "description": "Installation of double glazing", "n_windows": 20, # Counted the bay windows each as 3 "windows_area": 10.66, - "sap_points": 2, - "ending_sap": 48, - }, - { - "measure": "Draught Proofing", - "description": "Window draught proofing improvements", - "n_windows": 20, # Counted the bay windows each as 3 - "sap_points": 1, + "sap_points": 3, "ending_sap": 49, }, { @@ -379,7 +404,7 @@ def main(): }, { "measure": "Air Source Heat Pump", - "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", "sap_points": 15, "ending_sap": 73 }, @@ -497,17 +522,19 @@ def main(): {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'}, {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, - {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, - {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, - {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump', 'unit_price': 21189, 'unit': 'unit'}, - {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)', 'unit_price': 21189 + 1200, + 'unit': 'unit'}, {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, 'unit': 'floor_m2'}, - {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, {'item': '2x DMEV fans', 'unit_price': 
1070, 'unit': 'unit'}, + {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 'unit_needs_scaffolding'}, + {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, + {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, ] pricing_data = pd.DataFrame(pricing_data) @@ -587,13 +614,13 @@ def main(): result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") # Step 5: Calculate the ending SAP. - result_df["ending_sap"] = result_df["starting_sap"] + result_df["total_sap_points"] + result_df["Ending SAP"] = result_df["starting_sap"] + result_df["total_sap_points"] + result_df["Ending EPC Rating"] = result_df["Ending SAP"].apply(sap_to_epc) # Step 6: Merge the result with the measures_data to get the final DataFrame. final_measures = measures_data.merge( result_df, how="left", on="survey_key" ) - -if __name__ == "__main__": - main() +# if __name__ == "__main__": +# main() From 8f8993ab6480f30cbefe0ec8d6295005ba12dc6f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 15:31:09 +0000 Subject: [PATCH 55/59] added some additional aiha packages --- etl/customers/aiha/xml_extraction.py | 78 ++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 65e0eb1e..25917f1e 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -111,6 +111,9 @@ def main(): # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? # [Question for Lewis & Kevin] - [YES - ASHP!!!!] 
+ # TODO: Need AIH001-02 9C Clapton Common + # TODO: Check which properties are in a conservation area + recommended_measures = [ { "survey_key": "AIH001-01", @@ -501,6 +504,81 @@ def main(): } ] }, + { + "survey_key": "AIH001-15", + "starting_sap": 60, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 73.81, # Based on area of main building + "sap_points": 1, + "ending_sap": 61, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 64, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-West", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 71, + "notes": "The array is North-west facing and therefore will be slightly less efficient than south" + "facing, however the impact is not so severe as to make the installation not worthwhile." + "Ground mounted" + } + ] + }, + { + "survey_key": "AIH001-16", + "starting_sap": 60, + "recommended_measures": [ + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (21.56 * 2.60) + (26.79 * 2.8) + (6.74 * 2.60), + "sap_points": 4, + "ending_sap": 64, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 64, + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "sap_points": 1, + "ending_sap": 65, + }, + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "2.4W", + "orientation": "South-East", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 5, + "ending_sap": 70, + } + ] + } ] scaffolding_data = [ From b6cf10287b5867aa20a00123ee8c4de3e590e4a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Nov 2024 07:20:55 +0000 Subject: [PATCH 56/59] added AIH001-17 --- 
etl/customers/aiha/xml_extraction.py | 38 +++++++++++++++++++ .../stonewater/Wave 3 Preparation.py | 8 ++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 25917f1e..8c5c9008 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -578,6 +578,44 @@ def main(): "ending_sap": 70, } ] + }, + { + "survey_key": "AIH001-17", + "starting_sap": 62, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 63, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 66, + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "3.2kW", + "orientation": "East", + "elavation": 30, + "overshading": "None or little", + }, + { + "size": "0.8kW", + "orientation": "West", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 12, + "ending_sap": 78, + } + ] } ] diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 477a73c8..9f929db1 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -458,9 +458,6 @@ def extract_building_parts_summary(text): return dimensions -import re - - def extract_roof_details_epr(text): """ Extracts roof type, insulation, and insulation thickness for each building part @@ -1158,7 +1155,7 @@ def main(): create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1168,7 +1165,8 @@ 
def main(): def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath): # We read in the costed packages - costed_packages = pd.read_excel(costed_packages_filepath) + # Note: header=13 (0-indexed header row) is for Matt Ratcliff's reviewed version + costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages") archetypes_to_cost = costed_packages[ [ From 9ad7d3e46f30ee6a24e5d8c81dbd7f1035c04bee Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Nov 2024 11:24:02 +0000 Subject: [PATCH 57/59] added missing windows age extraction --- etl/customers/aiha/xml_extraction.py | 67 ++++++++++++++++++++++++-- etl/xml_survey_extraction/XmlParser.py | 1 + 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 8c5c9008..7dc516a6 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -113,6 +113,7 @@ def main(): # TODO: Need AIH001-02 9C Clapton Common # TODO: Check which properties are in a conservation area + # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR) recommended_measures = [ { @@ -560,6 +561,7 @@ def main(): { "measure": "Loft Insulation", "description": "300mm loft insulation", + "floor_area": 20.92, # Based on floor area of RIR "sap_points": 1, "ending_sap": 65, }, @@ -616,6 +618,27 @@ def main(): "ending_sap": 78, } ] + }, + { + "survey_key": "AIH001-18", + "starting_sap": 58, + "recommended_measures": [], + + }, + { + "survey_key": "AIH001-19", + "starting_sap": 76, + "recommended_measures": [] + }, + { + "survey_key": "AIH001-20", + "starting_sap": 82, + "recommended_measures": [] + }, + { + "survey_key": "AIH001-21", + "starting_sap": 53, + "recommended_measures": [] } ] @@ -648,6 +671,7 @@ def main(): {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}, {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit':
'unit_needs_scaffolding'}, {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '2.4kWp Solar PV system', 'unit_price': 3363, 'unit': 'unit_needs_scaffolding'}, {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, @@ -690,8 +714,14 @@ def main(): total_cost = survey.get("total_cost", 0) for measure in survey.get("recommended_measures", []): + # Include hlp and floor_area for each measure if available + hlp = measure.get("hlp", None) + floor_area = measure.get("floor_area", None) + normalized_measures.append({ "survey_key": survey_key, + "hlp": hlp, + "floor_area": floor_area, "starting_sap": starting_sap, "measure": measure["measure"], "description": measure.get("description", ""), @@ -712,16 +742,38 @@ def main(): fill_value=None ).reset_index() + measures_columns = [x for x in pivoted_measures.columns if x not in ["survey_key"]] + # We add a "Cost of" column for each measure + for measure in measures_columns: + pivoted_measures[f"Cost of {measure}"] = None + + pivoted_floor_area = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="floor_area", + aggfunc="first" # Use 'first' since each measure should only appear once per survey_key + ).add_prefix("floor_area - ").reset_index() + + pivoted_hlp = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="hlp", + aggfunc="first" + ).add_prefix("hlp - ").reset_index() + + # Merge hlp and floor_area data + pivoted_measures = pivoted_measures.merge(pivoted_hlp, on="survey_key", how="left") + pivoted_measures = pivoted_measures.merge(pivoted_floor_area, on="survey_key", how="left") + # Step 3: Calculate the total sap points and total cost for each survey. 
- sap_cost_totals = measures_df.groupby("survey_key").agg( + totals = measures_df.groupby("survey_key").agg( total_sap_points=("sap_points", "sum"), - total_cost_of_measures=("measure_cost", "sum") ).reset_index() # Merge total sap points into the pivoted measures. - pivoted_measures = pd.merge(pivoted_measures, sap_cost_totals, on="survey_key", how="left") - pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE - pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] + pivoted_measures = pd.merge(pivoted_measures, totals, on="survey_key", how="left") + # pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE + # pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] # Step 4: Extract starting SAP for each survey key. starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] @@ -738,5 +790,10 @@ def main(): result_df, how="left", on="survey_key" ) + final_measures.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Measures packages.csv") + + # Store costs + pricing_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Pricing data.csv") + # if __name__ == "__main__": # main() diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index fa70b6b7..ef8daf51 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -784,6 +784,7 @@ class XmlParser: glazing_type_lookup = { "ND": "Single glazing", + "1": "double glazing installed before 2002", "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", From 5dc78d6bb9c6b14029488bb27d769967bb4ba658 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Nov 2024 12:14:52 +0000 Subject: 
[PATCH 58/59] added measures for more properties --- etl/customers/aiha/xml_extraction.py | 105 ++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index 7dc516a6..d193c91e 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -122,6 +122,33 @@ def main(): "recommended_measures": [], "notes": "Is EPC C" }, + { + "survey_key": "AIH001-02", + "starting_sap": 65, + "recommended_measures": [ + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "2.4W", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 72, + "notes": "The array can be mounted on the flat roof, so that panels are south facing" + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 4, + "ending_sap": 76 + } + ], + }, { "survey_key": "AIH001-03", "starting_sap": 43, @@ -622,7 +649,41 @@ def main(): { "survey_key": "AIH001-18", "starting_sap": 58, - "recommended_measures": [], + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 37.52, # Based on area of main building and 1st extension + "sap_points": 7, + "ending_sap": 65, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 66, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 2, + "ending_sap": 68, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 75, + } + ], }, { @@ -638,7 +699,47 @@ def main(): { "survey_key": "AIH001-21", "starting_sap": 53, - "recommended_measures": [] + 
"recommended_measures": [ + { + "measure": "Cyliner Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 55, + }, + { + "measure": "Roof Insulation", + "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "floor_area": 22.80, # Based on floor area of RIR + "sap_points": 7, + "ending_sap": 62, + }, + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "1.6kWp", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + }, + { + "size": "0.8kWp", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 9, + "ending_sap": 71, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 74, + } + ] } ] From b75ae5f6b8de5855fd5278079de009e9a99ceb0e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 5 Nov 2024 11:34:15 +0000 Subject: [PATCH 59/59] minor --- etl/customers/aiha/xml_extraction.py | 122 ++++++++++++++++++++++----- 1 file changed, 103 insertions(+), 19 deletions(-) diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py index d193c91e..531b6752 100644 --- a/etl/customers/aiha/xml_extraction.py +++ b/etl/customers/aiha/xml_extraction.py @@ -102,8 +102,6 @@ def main(): # TODO # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft # [Can't remember, not clear - Chenai will check] - # - AIH001-04 why couldn't the cylinder be accessed? - treating this could get to the EPC C - # - Potential measure - search for the cylinder and insulate it # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same # buulding [Question for Lewis & Kevin] # - AIH001-09 - Is it not possible to install a loft hatch? 
[IT IS NOT, NO ACCESS - would need to accessed from @@ -111,9 +109,9 @@ def main(): # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? # [Question for Lewis & Kevin] - [YES - ASHP!!!!] - # TODO: Need AIH001-02 9C Clapton Common # TODO: Check which properties are in a conservation area # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR) + # TODO: Adjust Archetype 14 homes to exclude double glazing? Or should we exclude entirely recommended_measures = [ { @@ -376,6 +374,8 @@ def main(): "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension "sap_points": 8, "ending_sap": 70, + "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, " + "which is also owned by AIHA" } ], "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array" @@ -419,31 +419,31 @@ def main(): "sap_points": 3, "ending_sap": 49, }, - { - "measure": "Solar PV", - "description": "3.2kWp Solar PV system", - "config": [ - { - "size": "3.2W", - "orientation": "East", - "elavation": 30, - "overshading": "Little or none", - } - ], - "sap_points": 9, - "ending_sap": 58 - }, + # { + # "measure": "Solar PV", + # "description": "3.2kWp Solar PV system", + # "config": [ + # { + # "size": "3.2W", + # "orientation": "East", + # "elavation": 30, + # "overshading": "Little or none", + # } + # ], + # "sap_points": 9, + # "ending_sap": 58 + # }, { "measure": "Air Source Heat Pump", "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", "sap_points": 15, - "ending_sap": 73 + "ending_sap": 65 }, { "measure": "Tariff Review", "description": "Switch to 24-hour tariff", "sap_points": 15, - "ending_sap": 88 + "ending_sap": 80 } ] }, @@ -740,6 +740,90 @@ def main(): "ending_sap": 74, } ] + }, + { + "survey_key": "AIH001-SIMULATED-01", + "elmhurst_reference": "000020", + "starting_sap": None, + 
"recommended_measures": [ + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "hlp": (22.35 * 3.24) + (22.13 * 2.53), + "sap_points": 8, + "ending_sap": 52, + }, + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension + "sap_points": 1, + "ending_sap": 53, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 53, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 56, + }, + { + "measure": "Solar PV", + "description": "1.6kWp Solar PV system", + "config": [ + { + "size": "1.6W", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 6, + "ending_sap": 62 + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension + "sap_points": 8, + "ending_sap": 70, + "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, " + "which is also owned by AIHA" + } + ], + "notes": "This was cloned from 80A. There is no existing data for 80B" + }, + { + "survey_key": "AIH001-SIMULATED-05", + "starting_sap": 68, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 42.5, + "sap_points": 1, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 8, + "ending_sap": 77, + } + ] } ]