diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 863a6a6c..4ab33732 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -18,6 +18,7 @@ def extract_summary_report(pdf_path): """ data = { "Address": None, + "Postcode": None, "Current SAP Rating": None, "Space Heating": None, "Water Heating": None, @@ -200,7 +201,9 @@ def extract_epr(pdf_path): """ data = { "Address": None, + "Postcode": None, "Current SAP Rating": None, + "Potential SAP Rating": None, "Space Heating": None, "Water Heating": None, "Fuel Bill": None, @@ -211,6 +214,16 @@ def extract_epr(pdf_path): "Number of Windows": None, "Total Number of Doors": None, "Number of Insulated Doors": None, + "Existing Primary Heating System": None, + "Existing Primary Heating PCDF Reference": None, + "Existing Primary Heating Controls": None, + "Existing Primary Heating % of Heat": None, + "Existing Secondary Heating System": None, + "Existing Secondary Heating PCDF Reference": None, + "Existing Secondary Heating Controls": None, + "Existing Secondary Heating % of Heat": None, + "Secondary Heating Code": None, + "Water Heating Code": None, } with open(pdf_path, "rb") as file: @@ -222,41 +235,73 @@ def extract_epr(pdf_path): # Extract Address address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) data["Address"] = address_match.group(1).strip() + data["Postcode"] = data["Address"].split(",")[-1].strip() - # Extract Total Floor Area - # area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text) - # data["Total Floor Area"] = area_match.group(1) - - # Extract Current SAP rating - # Updated Regular Expression to find "GG (1-20)" followed by two numbers + # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text) + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap - # Extract and validate the Current and Potential SAP ratings - current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) - # Ensure potential is greater than or equal to current - if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: - data["Current SAP Rating"] = current_sap - data["Potential SAP Rating"] = potential_sap - else: - raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") - - # Extract Space Heating (kWh) - space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text) - data["Space Heating"] = int(space_heating_match.group(1)) - - # Extract Water Heating (kWh) - water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text) - data["Water Heating"] = int(water_heating_match.group(1)) - - # Extract Fuel Bill (total estimated costs) + # Extract Fuel Bill fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - # Extract the windows data + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Extract Primary Heating Section (Main Heating 1) + primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + primary_text).group(1) + data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) + ) + + # Extract Secondary Heating Section (Main Heating 2) + secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + secondary_text = secondary_heating_section.group(1) + + data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group( + 1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + + if data["Existing Secondary Heating System"] == "": + data["Existing Secondary Heating Controls"] = "" + else: + data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", + secondary_text).group(1).strip() + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip() + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + # Extract Windows information windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - data.update(window_data) + if windows_section: + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) return data