Add postcode, potential SAP rating and heating system fields to EPR extraction

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-29 14:46:01 +00:00
parent 753bda6cb0
commit 364b5b07e8

View file

@ -18,6 +18,7 @@ def extract_summary_report(pdf_path):
"""
data = {
"Address": None,
"Postcode": None,
"Current SAP Rating": None,
"Space Heating": None,
"Water Heating": None,
@ -200,7 +201,9 @@ def extract_epr(pdf_path):
"""
data = {
"Address": None,
"Postcode": None,
"Current SAP Rating": None,
"Potential SAP Rating": None,
"Space Heating": None,
"Water Heating": None,
"Fuel Bill": None,
@ -211,6 +214,16 @@ def extract_epr(pdf_path):
"Number of Windows": None,
"Total Number of Doors": None,
"Number of Insulated Doors": None,
"Existing Primary Heating System": None,
"Existing Primary Heating PCDF Reference": None,
"Existing Primary Heating Controls": None,
"Existing Primary Heating % of Heat": None,
"Existing Secondary Heating System": None,
"Existing Secondary Heating PCDF Reference": None,
"Existing Secondary Heating Controls": None,
"Existing Secondary Heating % of Heat": None,
"Secondary Heating Code": None,
"Water Heating Code": None,
}
with open(pdf_path, "rb") as file:
@ -222,41 +235,73 @@ def extract_epr(pdf_path):
# Extract Address
address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
data["Address"] = address_match.group(1).strip()
data["Postcode"] = data["Address"].split(",")[-1].strip()
# Extract Total Floor Area
# area_match = re.search(r"Total Floor Area\s*(\d+ m2)", text)
# data["Total Floor Area"] = area_match.group(1)
# Extract Current SAP rating
# Updated Regular Expression to find "GG (1-20)" followed by two numbers
# Extract Current and Potential SAP ratings
sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
data["Current SAP Rating"] = current_sap
# Extract and validate the Current and Potential SAP ratings
current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
# Ensure potential is greater than or equal to current
if 1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap:
data["Current SAP Rating"] = current_sap
data["Potential SAP Rating"] = potential_sap
else:
raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
# Extract Space Heating (kWh)
space_heating_match = re.search(r"Space Heating\s+(\d+)\s+kWh", text)
data["Space Heating"] = int(space_heating_match.group(1))
# Extract Water Heating (kWh)
water_heating_match = re.search(r"Water Heating\s+(\d+)\s+kWh", text)
data["Water Heating"] = int(water_heating_match.group(1))
# Extract Fuel Bill (total estimated costs)
# Extract Fuel Bill
fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
# Extract the windows data
# Extract Total Number of Doors
total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
data["Total Number of Doors"] = int(total_doors_match.group(1))
# Extract Number of Insulated Doors
insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
# Extract Primary Heating Section (Main Heating 1)
primary_heating_section = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
primary_text = primary_heating_section.group(1)
data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
1).strip()
data["Existing Primary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
primary_text).group(1)
data["Existing Primary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(
1).strip()
data["Existing Primary Heating % of Heat"] = int(
re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
)
# Extract Secondary Heating Section (Main Heating 2)
secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
secondary_text = secondary_heating_section.group(1)
data["Existing Secondary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", secondary_text).group(
1).strip()
data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
secondary_text).group(1)
if data["Existing Secondary Heating System"] == "":
data["Existing Secondary Heating Controls"] = ""
else:
data["Existing Secondary Heating Controls"] = re.search(r"Main Heating Controls\s*(.*?)\n",
secondary_text).group(1).strip()
data["Existing Secondary Heating % of Heat"] = int(
re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
)
# Extract Secondary Heating and Water Heating Codes
secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
if data["Existing Secondary Heating System"] == "":
data["Secondary Heating Code"] = ""
else:
data["Secondary Heating Code"] = secondary_heating_code_match.group(1).strip()
data["Water Heating Code"] = water_heating_code_match.group(1).strip()
# Extract Windows information
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
windows_text = windows_section.group(1)
window_data = extract_window_age_description(windows_text)
data.update(window_data)
if windows_section:
windows_text = windows_section.group(1)
window_data = extract_window_age_description(windows_text)
data.update(window_data)
return data