mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
extracting epr
This commit is contained in:
parent
0332c77098
commit
cf2a94cb36
1 changed file with 84 additions and 10 deletions
|
|
@ -43,6 +43,65 @@ def extract_summary_report(pdf_path):
|
|||
return data
|
||||
|
||||
|
||||
def _first_group(pattern, text, flags=0):
    """Return capture group 1 of the first match of *pattern* in *text*, or None when nothing matches."""
    match = re.search(pattern, text, flags)
    return match.group(1) if match else None


def _parse_epr_text(text):
    """Build the EPR data dictionary from the already-extracted text of an Energy Report."""
    # Every key is declared up front so callers always receive the same set of
    # keys, in the same order, even when a field cannot be found in the text.
    data = {
        "Address": None,
        "Estimated Annual Costs": None,
        "Current SAP": None,
        "Space Heating": None,
        "Water Heating": None,
        "Fuel Bill": None,
        "Total Floor Area": None,
        "Potential SAP": None,
    }

    # Extract Address (everything between the "Dwelling Address" label and the "Reference" label)
    address = _first_group(
        r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL
    )
    if address is not None:
        data["Address"] = address.strip()

    # Extract Total Floor Area (kept as text, e.g. "85 m2")
    data["Total Floor Area"] = _first_group(r"Total Floor Area\s*(\d+ m2)", text)

    # Extract Estimated Annual Costs; "Fuel Bill" is read from the same
    # "TOTAL £<n>" figure, so the pattern is only searched once.
    total_cost = _first_group(r"TOTAL\s*£(\d+)", text)
    if total_cost is not None:
        data["Estimated Annual Costs"] = f"£{total_cost}"
        data["Fuel Bill"] = f"£{total_cost}"

    # Extract Current and Potential SAP ratings: the digits that directly
    # follow the "GG (1-20)" label are split into two 1-2 digit numbers.
    # NOTE(review): the split is greedy, so a three-digit run such as "456" is
    # read as 45 and 6 and then rejected by the validation below.
    sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
    if sap_match is None:
        raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
    current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
    # Both ratings must lie in 1..99 and potential must not be below current.
    if not (1 <= current_sap <= potential_sap <= 99):
        raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
    data["Current SAP"] = current_sap
    data["Potential SAP"] = potential_sap

    # Extract Space Heating (kWh)
    space_heating = _first_group(r"Space Heating\s+(\d+)\s+kWh", text)
    if space_heating is not None:
        data["Space Heating"] = int(space_heating)

    # Extract Water Heating (kWh)
    water_heating = _first_group(r"Water Heating\s+(\d+)\s+kWh", text)
    if water_heating is not None:
        data["Water Heating"] = int(water_heating)

    return data


def extract_epr(pdf_path):
    """
    Extracts specific data from an Energy Report (EPR) PDF file.

    The text of every page is concatenated and searched with regular
    expressions. A field whose pattern is not found is left as None instead of
    aborting the whole extraction.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict with the keys "Address", "Estimated Annual Costs", "Current SAP",
        "Space Heating", "Water Heating", "Fuel Bill", "Total Floor Area" and
        "Potential SAP".

    Raises:
        ValueError: If the SAP ratings are missing or fail validation
            (each must be 1-99 and potential must be >= current).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # A page without extractable text contributes an empty string.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    return _parse_epr_text(text)
|
||||
|
||||
|
||||
def extract_retrofit_assessment_folder(retrofit_folder_path):
|
||||
"""
|
||||
Handles extraction from a retrofit assessment folder if it exists and has content.
|
||||
|
|
@ -61,22 +120,38 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
|
|||
return None # If no relevant PDF is found
|
||||
|
||||
|
||||
def is_energy_report(text):
    """
    Determines if the provided text indicates that the PDF is an Energy Report.

    Returns True if the text, ignoring any leading whitespace, begins with the
    upper-case heading 'ENERGY REPORT'. Returns False otherwise, including for
    empty or missing (None) text.
    """
    if not text:
        return False
    # Extracted PDF text may start with blank lines or spaces before the heading.
    return text.lstrip().startswith("ENERGY REPORT")
|
||||
|
||||
|
||||
def extract_from_survey_folder_files(survey_folder_path):
    """
    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.

    Each file whose name ends in ".pdf" is opened and classified:
    a PDF whose first page is an Energy Report is passed to extract_epr, and a
    PDF with "summary" in its file name is passed to extract_summary_report.
    The first PDF that can be classified determines the result.

    Args:
        survey_folder_path: Directory to scan (non-recursive).

    Returns:
        The data returned by the matching extractor, or None when the folder
        contains no PDF files.

    Raises:
        NotImplementedError: If the folder contains PDF files but none of them
            is a recognised report type.
    """
    survey_files = [f for f in os.listdir(survey_folder_path) if f.endswith(".pdf")]
    found_unrecognised_pdf = False

    for pdf_file in survey_files:
        pdf_path = os.path.join(survey_folder_path, pdf_file)

        # Attempt to read the first page of the PDF to determine type
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            first_page_text = (reader.pages[0].extract_text() or "") if reader.pages else ""

        if is_energy_report(first_page_text):
            # Treat this as an Energy Report
            return extract_epr(pdf_path)
        if "summary" in pdf_file.lower():
            # Treat this as a Summary Report
            return extract_summary_report(pdf_path)

        # Keep scanning: directory listing order is arbitrary, so an
        # unrecognised PDF must not hide a usable report listed after it.
        found_unrecognised_pdf = True

    if found_unrecognised_pdf:
        raise NotImplementedError("Implement me")

    return None  # If no relevant PDF is found
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -109,7 +184,6 @@ def main():
|
|||
}
|
||||
extracted_data.append(summary_data)
|
||||
continue
|
||||
|
||||
# If no retrofit folder or it was empty, check files in survey_folder
|
||||
summary_data = extract_from_survey_folder_files(survey_folder_path)
|
||||
if summary_data:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue