mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Debugging EPR extraction when the dimensions are external
This commit is contained in:
parent
ca7a0e9d10
commit
fd98721748
1 changed file with 13 additions and 2 deletions
|
|
@ -465,7 +465,11 @@ def extract_building_parts_summary(text):
|
|||
r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
|
||||
)
|
||||
if not dimensions_section:
|
||||
raise ValueError("Failed to locate dimensions section in the text.")
|
||||
dimensions_section = re.search(
|
||||
r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
|
||||
)
|
||||
if not dimensions_section:
|
||||
raise ValueError("Failed to locate dimensions section in the text.")
|
||||
|
||||
dimensions_text = dimensions_section.group(1)
|
||||
|
||||
|
|
@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file):
|
|||
"""
|
||||
# Attempt to read the first page of the PDF to determine type
|
||||
with open(pdf_path, "rb") as file:
|
||||
# This code raises some warnings, such as "Multiple definitions in dictionary at byte 0x1ab for key /Filter".
|
||||
# This is because the PDF is irregular. We could possibly try a library like fitz to handle this.
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
first_page_text = reader.pages[0].extract_text() if reader.pages else ""
|
||||
n_pages = len(reader.pages)
|
||||
|
||||
if is_energy_report(first_page_text):
|
||||
if is_energy_report(first_page_text) and n_pages > 3:
|
||||
# The EPR should have more than 3 pages
|
||||
return "epr"
|
||||
elif is_energy_report(first_page_text) and n_pages <= 3:
|
||||
# This is a shortened version of the EPR which isn't massively useful
|
||||
return "short_form_epr"
|
||||
elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
|
||||
return "summary"
|
||||
elif is_condition_report(first_page_text):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue