From 33ea47e71d8b0a226629400dca5b6400b46daf96 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 28 Oct 2024 12:42:28 +0000 Subject: [PATCH] fixed address extraction --- .../stonewater/Wave 3 Preparation.py | 47 ++++++++++++++----- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 53d5bb34..bc567bd2 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,6 +1,7 @@ import os import PyPDF2 import re +import pandas as pd FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" @@ -11,12 +12,12 @@ def extract_summary_report(pdf_path): Data includes: - Current SAP rating - Fuel Bill - - Emissions (t/year) + - Address """ data = { - "Current SAP rating": None, + "Address": None, + "Current SAP Rating": None, "Fuel Bill": None, - "Emissions (t/year)": None, } with open(pdf_path, "rb") as file: @@ -28,17 +29,36 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) if sap_match: - data["Current SAP rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) if fuel_bill_match: data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" - # Extract Emissions - emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text) - if emissions_match: - data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes" + # Extract individual address components + postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + + # Clean extracted values and remove any prefixes + address_parts = [ + house_no.group(1).strip() if house_no else "", + house_name.group(1).strip() if house_name else "", + street.group(1).strip() if street else "", + locality.group(1).strip() if locality else "", + town.group(1).strip() if town else "", + county.group(1).strip() if county else "", + postcode.group(1).strip() if postcode else "" + ] + + # Join non-empty parts with a comma + data["Address"] = ", ".join([part for part in address_parts if part]) return data @@ -49,8 +69,7 @@ def extract_epr(pdf_path): """ data = { "Address": None, - "Estimated Annual Costs": None, - "Current SAP": None, + "Current SAP Rating": None, "Space Heating": None, "Water Heating": None, "Fuel Bill": None, @@ -82,8 +101,8 @@ def extract_epr(pdf_path): current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2)) # Ensure potential is greater than or equal to current if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap: - data["Current SAP"] = current_sap - data["Potential SAP"] = potential_sap + data["Current SAP Rating"] = current_sap + data["Potential SAP Rating"] = potential_sap else: raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.") @@ -117,6 +136,8 @@ def extract_retrofit_assessment_folder(retrofit_folder_path): pdf_path = os.path.join(retrofit_folder_path, summary_report) return extract_summary_report(pdf_path) + raise Exception("Not Implemented") + return None # If no relevant PDF is found @@ -193,7 +214,7 @@ def main(): } extracted_data.append(summary_data) - print("Extracted Data:", extracted_data) + extracted_data = pd.DataFrame(extracted_data) if __name__ == "__main__": diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index e9a5c8ea..2cabb047 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1 +1,2 @@ PyPDF2 +pandas