# Energy Report (EPR) extraction and survey-folder dispatch for
# etl/customers/stonewater/Wave 3 Preparation.py.
# These definitions rely on the module-level `os`, `re` and `PyPDF2` imports
# of that file, and on its existing `extract_summary_report` function.


def _require_match(pattern, text, field, flags=0):
    """
    Returns the first regex match of `pattern` in `text`.

    Raises ValueError naming `field` when the pattern is absent, so a malformed
    report fails with a readable message instead of an AttributeError on None.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        raise ValueError(f"Failed to parse {field} from Energy Report text.")
    return match


def parse_epr_text(text):
    """
    Parses the already-extracted text of an Energy Report (EPR).

    Args:
        text: The concatenated text of every page of the report.

    Returns:
        A dict with the keys "Address", "Estimated Annual Costs", "Current SAP",
        "Space Heating", "Water Heating", "Fuel Bill", "Total Floor Area" and
        "Potential SAP" (in that insertion order).

    Raises:
        ValueError: If any field cannot be located, or if the SAP ratings fail
            the sanity check (both within 1-99 and potential >= current).
    """
    data = {
        "Address": None,
        "Estimated Annual Costs": None,
        "Current SAP": None,
        "Space Heating": None,
        "Water Heating": None,
        "Fuel Bill": None,
    }

    # The address may span several lines, hence DOTALL with a lazy capture that
    # stops at the "Reference" line.
    address_match = _require_match(
        r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, "Address", re.DOTALL
    )
    data["Address"] = address_match.group(1).strip()

    area_match = _require_match(r"Total Floor Area\s*(\d+ m2)", text, "Total Floor Area")
    data["Total Floor Area"] = area_match.group(1)

    # The same "TOTAL £..." figure feeds both "Estimated Annual Costs" and
    # "Fuel Bill", so it is matched once and reused.
    cost_match = _require_match(r"TOTAL\s*£(\d+)", text, "Estimated Annual Costs")
    total_cost = f"£{cost_match.group(1)}"
    data["Estimated Annual Costs"] = total_cost

    # The pattern expects the current and potential ratings as adjacent digits
    # right after "GG (1-20)". The split is greedy (up to two digits for the
    # first number), so the range/ordering check below guards against a
    # mis-split such as "785" being read as 78 and 5.
    sap_match = _require_match(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text, "SAP ratings")
    current_sap = int(sap_match.group(1))
    potential_sap = int(sap_match.group(2))
    ratings_valid = 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap
    if not ratings_valid:
        raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
    data["Current SAP"] = current_sap
    data["Potential SAP"] = potential_sap

    space_heating_match = _require_match(r"Space Heating\s+(\d+)\s+kWh", text, "Space Heating")
    data["Space Heating"] = int(space_heating_match.group(1))

    water_heating_match = _require_match(r"Water Heating\s+(\d+)\s+kWh", text, "Water Heating")
    data["Water Heating"] = int(water_heating_match.group(1))

    data["Fuel Bill"] = total_cost

    return data


def extract_epr(pdf_path):
    """
    Extracts specific data from an Energy Report (EPR) PDF file.

    Args:
        pdf_path: Path to the Energy Report PDF.

    Returns:
        The dict produced by `parse_epr_text` for the text of all pages.

    Raises:
        ValueError: If a field is missing from the report or the SAP ratings
            cannot be parsed sensibly.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Pages are read while the file is still open; `or ""` tolerates a page
        # whose text extraction yields nothing.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    return parse_epr_text(text)


def is_energy_report(text):
    """
    Determines if the provided text indicates that the PDF is an Energy Report.

    Returns True only if the text starts with the exact, upper-case heading
    'ENERGY REPORT'; the check is case-sensitive and is not a substring search.
    """
    return text.startswith("ENERGY REPORT")


def extract_from_survey_folder_files(survey_folder_path):
    """
    Handles extraction directly from files in the survey folder when no
    'retrofit assessment' folder exists.

    Each PDF is classified from the text of its first page (Energy Report) or
    from its file name (Summary Report). The first recognised PDF, in sorted
    file-name order, is extracted and returned.

    Args:
        survey_folder_path: Directory holding the survey's files.

    Returns:
        The extracted data dict, or None if no relevant PDF is found.
    """
    # Sorted so the chosen report does not depend on os.listdir()'s arbitrary
    # ordering; the suffix test is case-insensitive so "REPORT.PDF" is included.
    pdf_names = sorted(
        name for name in os.listdir(survey_folder_path) if name.lower().endswith(".pdf")
    )

    for pdf_name in pdf_names:
        pdf_path = os.path.join(survey_folder_path, pdf_name)

        # Read only the first page to determine the document type.
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            first_page_text = (reader.pages[0].extract_text() or "") if reader.pages else ""

        if is_energy_report(first_page_text):
            return extract_epr(pdf_path)
        if "summary" in pdf_name.lower():
            return extract_summary_report(pdf_path)
        # Unrecognised PDF: keep scanning rather than aborting, so an unrelated
        # document cannot hide a report that appears later in the folder.

    return None