From fd98721748c9da95c3660116f33b6aa00d1be01f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 15:24:02 +0000
Subject: [PATCH] debugging epr extraction when the dimensions are external

---
 etl/customers/stonewater/Wave 3 Preparation.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ee314f17..4db089e7 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -465,7 +465,11 @@ def extract_building_parts_summary(text):
         r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
     )
     if not dimensions_section:
-        raise ValueError("Failed to locate dimensions section in the text.")
+        dimensions_section = re.search(
+            r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
+        )
+        if not dimensions_section:
+            raise ValueError("Failed to locate dimensions section in the text.")
 
     dimensions_text = dimensions_section.group(1)
 
@@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file):
     """
     # Attempt to read the first page of the PDF to determine type
     with open(pdf_path, "rb") as file:
+        # PyPDF2 emits warnings here such as "Multiple definitions in dictionary at byte 0x1ab for key /Filter"
+        # because the PDF is irregular. A different library such as PyMuPDF (fitz) could possibly handle this better
         reader = PyPDF2.PdfReader(file)
         first_page_text = reader.pages[0].extract_text() if reader.pages else ""
+        n_pages = len(reader.pages)
 
-        if is_energy_report(first_page_text):
+        if is_energy_report(first_page_text) and n_pages > 3:
+            # The EPR should have more than 3 pages
             return "epr"
+        elif is_energy_report(first_page_text) and n_pages <= 3:
+            # With 3 or fewer pages this is the shortened form of the EPR, which isn't very useful
+            return "short_form_epr"
         elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
             return "summary"
         elif is_condition_report(first_page_text):