diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py
index 47b86e89..c19c78b1 100644
--- a/etl/customers/livewest/route_march_2024_10_28.py
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@@ -61,7 +61,8 @@ def app():
             os_api_key="",
             property_type=None,
             fast=True,
-            full_address=full_address
+            full_address=full_address,
+            max_retries=3
         )
         # Force the skipping of estimating the EPC
         searcher.ordnance_survey_client.property_type = None
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index fc11f1c0..a8e06416 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -3,6 +3,7 @@ import PyPDF2
 import re
 import pandas as pd
 from tqdm import tqdm
+from collections import Counter
 
 FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
 
@@ -19,6 +20,8 @@ def extract_summary_report(pdf_path):
         "Address": None,
         "Current SAP Rating": None,
         "Fuel Bill": None,
+        "Window Age Description": None,
+        "Window Age Description Proportion (%)": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -61,9 +64,78 @@ def extract_summary_report(pdf_path):
         # Join non-empty parts with a comma
         data["Address"] = ", ".join([part for part in address_parts if part])
 
+        # Extract the windows data
+        data.update(extract_window_data(text))
+
     return data
 
 
+def extract_window_data(text):
+    """
+    Locates the "Windows" section of a report and summarises its window ages.
+
+    Parameters:
+    text (str): The full text extracted from the report PDF.
+
+    Returns:
+    dict: A dictionary with the most common window age description and its proportion.
+
+    Raises:
+    ValueError: If the section is missing or contains no known window age description.
+    """
+    windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+    if windows_section is None:
+        # Fail with a clear message rather than an AttributeError on None.group()
+        raise ValueError("Failed to locate the windows section.")
+
+    return extract_window_age_description(windows_section.group(1))
+
+
+def extract_window_age_description(windows_text):
+    """
+    Extracts the most common window age description and its proportion.
+
+    Parameters:
+    windows_text (str): The text section containing window data.
+
+    Returns:
+    dict: A dictionary with the most common window age description and its proportion.
+
+    Raises:
+    ValueError: If none of the known window age descriptions are found.
+    """
+    # Clean up windows_text by removing line breaks for better pattern matching
+    windows_text = windows_text.replace("\n", "")
+
+    # Define possible window age descriptions
+    window_descriptions = [
+        "Double post or during 2002",
+        "Double pre 2002",
+        "Double with unknown install date",
+        "Secondary glazing",
+        "Triple glazing",
+        "Single glazing",
+    ]
+
+    # Count occurrences of each description
+    description_counts = Counter(
+        {description: windows_text.count(description) for description in window_descriptions}
+    )
+    total_windows = sum(description_counts.values())
+
+    if not total_windows:
+        raise ValueError("Failed to extract window data.")
+
+    # Determine the most common description and calculate its proportion
+    most_common_description, window_count = description_counts.most_common(1)[0]
+    window_proportion = window_count / total_windows * 100
+
+    return {
+        "Window Age Description": most_common_description,
+        "Window Age Description Proportion (%)": window_proportion,
+    }
+
+
 def extract_epr(pdf_path):
     """
     Extracts specific data from an Energy Report (EPR) PDF file.
@@ -74,6 +146,8 @@ def extract_epr(pdf_path):
         "Space Heating": None,
         "Water Heating": None,
         "Fuel Bill": None,
+        "Window Age Description": None,
+        "Window Age Description Proportion (%)": None,
     }
 
     with open(pdf_path, "rb") as file:
@@ -115,4 +189,7 @@ def extract_epr(pdf_path):
         fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
         data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
 
+        # Extract the windows data
+        data.update(extract_window_data(text))
+
     return data