handling extraction of windows data

2026-07-27 23:35:01 +00:00 · 2024-10-28 20:29:31 +00:00 · 2024-10-28 20:29:31 +00:00 · 8bf5b23410
commit 8bf5b23410
parent 86ca5b4007
2 changed files with 60 additions and 1 deletions
--- a/etl/customers/livewest/route_march_2024_10_28.py
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@ -61,7 +61,8 @@ def app():
                os_api_key="",
                property_type=None,
                fast=True,
-                full_address=full_address
+                full_address=full_address,
+                max_retries=3
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -3,6 +3,7 @@ import PyPDF2
 import re
 import pandas as pd
 from tqdm import tqdm
+from collections import Counter

 FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"

@ -19,6 +20,8 @@ def extract_summary_report(pdf_path):
        "Address": None,
        "Current SAP Rating": None,
        "Fuel Bill": None,
+        "Window Age Description": None,
+        "Window Age Description Proportion (%)": None,
    }

    with open(pdf_path, "rb") as file:
@ -61,9 +64,56 @@ def extract_summary_report(pdf_path):
        # Join non-empty parts with a comma
        data["Address"] = ", ".join([part for part in address_parts if part])

+        windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+        windows_text = windows_section.group(1)
+        window_data = extract_window_age_description(windows_text)
+        data.update(window_data)
+
    return data


+def extract_window_age_description(windows_text):
+    """
+    Extracts the most common window age description and its proportion.
+
+    Matching tolerates missing or repeated whitespace between the words of a
+    description, so an entry wrapped across lines of the text is still counted.
+
+    Parameters:
+        windows_text (str): The text section containing window data.
+
+    Returns:
+        dict: "Window Age Description" holds the most frequent description
+            (ties go to the one listed first below) and "Window Age
+            Description Proportion (%)" holds its share, from 0 to 100, of all
+            matched descriptions.
+
+    Raises:
+        ValueError: If the text is empty or contains no known description.
+    """
+    # Line breaks are dropped before matching, which can fuse the words on
+    # either side of a wrap; the patterns below therefore allow \s* between
+    # the words of each description instead of requiring a literal space.
+    windows_text = (windows_text or "").replace("\n", "")
+
+    # Define possible window age descriptions
+    window_descriptions = [
+        "Double post or during 2002",
+        "Double pre 2002",
+        "Double with unknown install date",
+        "Secondary glazing",
+        "Triple glazing",
+        "Single glazing",
+    ]
+
+    # Count occurrences of each description
+    description_counts = Counter()
+    for description in window_descriptions:
+        pattern = r"\s*".join(re.escape(word) for word in description.split())
+        description_counts[description] = len(re.findall(pattern, windows_text))
+
+    total_windows = sum(description_counts.values())
+    if not total_windows:
+        raise ValueError("Failed to extract window data.")
+
+    # Determine the most common description and calculate its proportion
+    most_common_description, window_count = description_counts.most_common(1)[0]
+    return {
+        "Window Age Description": most_common_description,
+        "Window Age Description Proportion (%)": window_count / total_windows * 100,
+    }
+
+
 def extract_epr(pdf_path):
    """
    Extracts specific data from an Energy Report (EPR) PDF file.
@ -74,6 +124,8 @@ def extract_epr(pdf_path):
        "Space Heating": None,
        "Water Heating": None,
        "Fuel Bill": None,
+        "Window Age Description": None,
+        "Window Age Description Proportion (%)": None,
    }

    with open(pdf_path, "rb") as file:
@ -115,6 +167,12 @@ def extract_epr(pdf_path):
        fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

+        # Extract the windows data
+        windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+        windows_text = windows_section.group(1)
+        window_data = extract_window_age_description(windows_text)
+        data.update(window_data)
+
    return data