From 753bda6cb0bc4c8de266944c04ab99db7d74da3d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 14:21:01 +0000 Subject: [PATCH] extracting heating systems from summary report --- .../stonewater/Wave 3 Preparation.py | 86 ++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b1b48cec..863a6a6c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -19,10 +19,26 @@ def extract_summary_report(pdf_path): data = { "Address": None, "Current SAP Rating": None, - "Number of Storeys": None, + "Space Heating": None, + "Water Heating": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, + "Secondary Window Age Description": None, + "Secondary Window Age Description Proportion (%)": None, + "Number of Windows": None, + "Total Number of Doors": None, + "Number of Insulated Doors": None, + "Existing Primary Heating System": None, + "Existing Primary Heating PCDF Reference": None, + "Existing Primary Heating Controls": None, + "Existing Primary Heating % of Heat": None, + "Existing Secondary Heating System": None, + "Existing Secondary Heating PCDF Reference": None, + "Existing Secondary Heating Controls": None, + "Existing Secondary Heating % of Heat": None, + "Secondary Heating Code": None, + "Water Heating Code": None, } with open(pdf_path, "rb") as file: @@ -39,6 +55,10 @@ def extract_summary_report(pdf_path): storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) + # Extract Carbon Emissions + carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) + data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) + # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" @@ -66,12 +86,58 @@ def extract_summary_report(pdf_path): # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) + data["Postcode"] = postcode.group(1).strip() windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) windows_text = windows_section.group(1) window_data = extract_window_age_description(windows_text) data.update(window_data) + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Extract heating system + # Extract Primary Heating Data + # Extract Primary Heating Section + primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + primary_text).group(1) + data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) + ) + + # Extract Secondary Heating Section + secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( + 1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + return data @@ -111,9 +177,20 @@ def extract_window_age_description(windows_text): most_common_description, window_count = description_counts.most_common(1)[0] window_proportion = window_count / sum(description_counts.values()) * 100 + # Get the second most common and the proportion + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + return { "Window Age Description": most_common_description, - "Window Age Description Proportion (%)": window_proportion + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) } @@ -129,6 +206,11 @@ def extract_epr(pdf_path): "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, + "Secondary Window Age Description": None, + "Secondary Window Age Description Proportion (%)": None, + "Number of Windows": None, + "Total Number of Doors": None, + "Number of Insulated Doors": None, } with open(pdf_path, "rb") as file: