mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
fixed address extraction
This commit is contained in:
parent
cf2a94cb36
commit
33ea47e71d
2 changed files with 35 additions and 13 deletions
|
|
@@ -1,6 +1,7 @@
|
|||
import os
|
||||
import PyPDF2
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
|
||||
|
||||
|
|
@@ -11,12 +12,12 @@ def extract_summary_report(pdf_path):
|
|||
Data includes:
|
||||
- Current SAP rating
|
||||
- Fuel Bill
|
||||
- Emissions (t/year)
|
||||
- Address
|
||||
"""
|
||||
data = {
|
||||
"Current SAP rating": None,
|
||||
"Address": None,
|
||||
"Current SAP Rating": None,
|
||||
"Fuel Bill": None,
|
||||
"Emissions (t/year)": None,
|
||||
}
|
||||
|
||||
with open(pdf_path, "rb") as file:
|
||||
|
|
@@ -28,17 +29,36 @@ def extract_summary_report(pdf_path):
|
|||
# Extract Current SAP rating
|
||||
sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
|
||||
if sap_match:
|
||||
data["Current SAP rating"] = sap_match.group(1)
|
||||
data["Current SAP Rating"] = sap_match.group(1)
|
||||
|
||||
# Extract Fuel Bill
|
||||
fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
|
||||
if fuel_bill_match:
|
||||
data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
|
||||
|
||||
# Extract Emissions
|
||||
emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
|
||||
if emissions_match:
|
||||
data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
|
||||
# Extract individual address components
|
||||
postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
|
||||
# region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
|
||||
house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
|
||||
house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
|
||||
street = re.search(r"Street:\s*(.*?)\nLocality:", text)
|
||||
locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
|
||||
town = re.search(r"Town:\s*(.*?)\nCounty:", text)
|
||||
county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)
|
||||
|
||||
# Clean extracted values and remove any prefixes
|
||||
address_parts = [
|
||||
house_no.group(1).strip() if house_no else "",
|
||||
house_name.group(1).strip() if house_name else "",
|
||||
street.group(1).strip() if street else "",
|
||||
locality.group(1).strip() if locality else "",
|
||||
town.group(1).strip() if town else "",
|
||||
county.group(1).strip() if county else "",
|
||||
postcode.group(1).strip() if postcode else ""
|
||||
]
|
||||
|
||||
# Join non-empty parts with a comma
|
||||
data["Address"] = ", ".join([part for part in address_parts if part])
|
||||
|
||||
return data
|
||||
|
||||
|
|
@@ -49,8 +69,7 @@ def extract_epr(pdf_path):
|
|||
"""
|
||||
data = {
|
||||
"Address": None,
|
||||
"Estimated Annual Costs": None,
|
||||
"Current SAP": None,
|
||||
"Current SAP Rating": None,
|
||||
"Space Heating": None,
|
||||
"Water Heating": None,
|
||||
"Fuel Bill": None,
|
||||
|
|
@@ -82,8 +101,8 @@ def extract_epr(pdf_path):
|
|||
current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
|
||||
# Ensure potential is greater than or equal to current
|
||||
if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
|
||||
data["Current SAP"] = current_sap
|
||||
data["Potential SAP"] = potential_sap
|
||||
data["Current SAP Rating"] = current_sap
|
||||
data["Potential SAP Rating"] = potential_sap
|
||||
else:
|
||||
raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
|
||||
|
||||
|
|
@@ -117,6 +136,8 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
|
|||
pdf_path = os.path.join(retrofit_folder_path, summary_report)
|
||||
return extract_summary_report(pdf_path)
|
||||
|
||||
raise Exception("Not Implemented")
|
||||
|
||||
return None # If no relevant PDF is found
|
||||
|
||||
|
||||
|
|
@@ -193,7 +214,7 @@ def main():
|
|||
}
|
||||
extracted_data.append(summary_data)
|
||||
|
||||
print("Extracted Data:", extracted_data)
|
||||
extracted_data = pd.DataFrame(extracted_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@@ -1 +1,2 @@
|
|||
PyPDF2
|
||||
pandas
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue