Debugging EPR extraction when the dimensions are external (i.e. the report says "Dimension type: External")

This commit is contained in:
Khalim Conn-Kowlessar 2025-01-29 15:24:02 +00:00
parent ca7a0e9d10
commit fd98721748

View file

@ -465,7 +465,11 @@ def extract_building_parts_summary(text):
r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
)
if not dimensions_section:
raise ValueError("Failed to locate dimensions section in the text.")
dimensions_section = re.search(
r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
)
if not dimensions_section:
raise ValueError("Failed to locate dimensions section in the text.")
dimensions_text = dimensions_section.group(1)
@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file):
"""
# Attempt to read the first page of the PDF to determine type
with open(pdf_path, "rb") as file:
# This code raises some warnings, e.g. "Multiple definitions in dictionary at byte 0x1ab for key /Filter".
# This is because the PDF is irregular. We could possibly try a library like fitz (PyMuPDF) to handle this.
reader = PyPDF2.PdfReader(file)
first_page_text = reader.pages[0].extract_text() if reader.pages else ""
n_pages = len(reader.pages)
if is_energy_report(first_page_text):
if is_energy_report(first_page_text) and n_pages > 3:
# The EPR should have more than 3 pages
return "epr"
elif is_energy_report(first_page_text) and n_pages <= 3:
# This is a shortened version of the EPR which isn't massively useful
return "short_form_epr"
elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
return "summary"
elif is_condition_report(first_page_text):