From fd98721748c9da95c3660116f33b6aa00d1be01f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 29 Jan 2025 15:24:02 +0000 Subject: [PATCH] debugging epr extraction when the dimensions are external --- etl/customers/stonewater/Wave 3 Preparation.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ee314f17..4db089e7 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -465,7 +465,11 @@ def extract_building_parts_summary(text): r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL ) if not dimensions_section: - raise ValueError("Failed to locate dimensions section in the text.") + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") dimensions_text = dimensions_section.group(1) @@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file): """ # Attempt to read the first page of the PDF to determine type with open(pdf_path, "rb") as file: + # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter + # This is because the pdf is irregular. 
We could possibly try a library like PyMuPDF (imported as fitz) to handle this reader = PyPDF2.PdfReader(file) first_page_text = reader.pages[0].extract_text() if reader.pages else "" + n_pages = len(reader.pages) - if is_energy_report(first_page_text): + if is_energy_report(first_page_text) and n_pages > 3: + # The EPR should have more than 3 pages return "epr" + elif is_energy_report(first_page_text) and n_pages <= 3: + # This is a shortened version of the EPR which isn't massively useful + return "short_form_epr" elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): return "summary" elif is_condition_report(first_page_text):