Debug EPR extraction when the dimensions are external

2026-07-27 23:35:01 +00:00 · 2025-01-29 15:24:02 +00:00 · 2025-01-29 15:24:02 +00:00 · fd98721748
commit fd98721748
parent ca7a0e9d10
1 changed files with 13 additions and 2 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -465,7 +465,11 @@ def extract_building_parts_summary(text):
        r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
    )
    if not dimensions_section:
-        raise ValueError("Failed to locate dimensions section in the text.")
+        dimensions_section = re.search(
+            r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
+        )
+        if not dimensions_section:
+            raise ValueError("Failed to locate dimensions section in the text.")

    dimensions_text = dimensions_section.group(1)

@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file):
    """
    # Attempt to read the first page of the PDF to determine type
    with open(pdf_path, "rb") as file:
+        # This code emits warnings such as "Multiple definitions in dictionary at byte 0x1ab for key /Filter"
+        # because the PDF is irregular. We could possibly try a library like fitz (PyMuPDF) to handle this.
        reader = PyPDF2.PdfReader(file)
        first_page_text = reader.pages[0].extract_text() if reader.pages else ""
+        n_pages = len(reader.pages)

-        if is_energy_report(first_page_text):
+        if is_energy_report(first_page_text) and n_pages > 3:
+            # The EPR should have more than 3 pages
            return "epr"
+        elif is_energy_report(first_page_text) and n_pages <= 3:
+            # This is a shortened version of the EPR which isn't massively useful
+            return "short_form_epr"
        elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
            return "summary"
        elif is_condition_report(first_page_text):