fixed address extraction

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 12:42:28 +00:00
parent cf2a94cb36
commit 33ea47e71d
2 changed files with 35 additions and 13 deletions

View file

@ -1,6 +1,7 @@
import os
import PyPDF2
import re
import pandas as pd
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
@ -11,12 +12,12 @@ def extract_summary_report(pdf_path):
Data includes:
- Current SAP rating
- Fuel Bill
- Emissions (t/year)
- Address
"""
data = {
"Current SAP rating": None,
"Address": None,
"Current SAP Rating": None,
"Fuel Bill": None,
"Emissions (t/year)": None,
}
with open(pdf_path, "rb") as file:
@ -28,17 +29,36 @@ def extract_summary_report(pdf_path):
# Extract Current SAP rating
sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
if sap_match:
data["Current SAP rating"] = sap_match.group(1)
data["Current SAP Rating"] = sap_match.group(1)
# Extract Fuel Bill
fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
if fuel_bill_match:
data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
# Extract Emissions
emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
if emissions_match:
data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
# Extract individual address components
postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
# region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
street = re.search(r"Street:\s*(.*?)\nLocality:", text)
locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
town = re.search(r"Town:\s*(.*?)\nCounty:", text)
county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)
# Strip surrounding whitespace from each captured value; a field whose pattern did not match becomes an empty string
address_parts = [
house_no.group(1).strip() if house_no else "",
house_name.group(1).strip() if house_name else "",
street.group(1).strip() if street else "",
locality.group(1).strip() if locality else "",
town.group(1).strip() if town else "",
county.group(1).strip() if county else "",
postcode.group(1).strip() if postcode else ""
]
# Join non-empty parts with a comma
data["Address"] = ", ".join([part for part in address_parts if part])
return data
@ -49,8 +69,7 @@ def extract_epr(pdf_path):
"""
data = {
"Address": None,
"Estimated Annual Costs": None,
"Current SAP": None,
"Current SAP Rating": None,
"Space Heating": None,
"Water Heating": None,
"Fuel Bill": None,
@ -82,8 +101,8 @@ def extract_epr(pdf_path):
current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
# Ensure potential is greater than or equal to current
if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
data["Current SAP"] = current_sap
data["Potential SAP"] = potential_sap
data["Current SAP Rating"] = current_sap
data["Potential SAP Rating"] = potential_sap
else:
raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
@ -117,6 +136,8 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
pdf_path = os.path.join(retrofit_folder_path, summary_report)
return extract_summary_report(pdf_path)
raise Exception("Not Implemented")
return None # If no relevant PDF is found
@ -193,7 +214,7 @@ def main():
}
extracted_data.append(summary_data)
print("Extracted Data:", extracted_data)
extracted_data = pd.DataFrame(extracted_data)
if __name__ == "__main__":