handling extraction of windows data

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 20:29:31 +00:00
parent 86ca5b4007
commit 8bf5b23410
2 changed files with 60 additions and 1 deletions

View file

@ -61,7 +61,8 @@ def app():
os_api_key="",
property_type=None,
fast=True,
full_address=full_address
full_address=full_address,
max_retries=3
)
# Force the skipping of estimating the EPC
searcher.ordnance_survey_client.property_type = None

View file

@ -3,6 +3,7 @@ import PyPDF2
import re
import pandas as pd
from tqdm import tqdm
from collections import Counter
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
@ -19,6 +20,8 @@ def extract_summary_report(pdf_path):
"Address": None,
"Current SAP Rating": None,
"Fuel Bill": None,
"Window Age Description": None,
"Window Age Description Proportion (%)": None,
}
with open(pdf_path, "rb") as file:
@ -61,9 +64,56 @@ def extract_summary_report(pdf_path):
# Join non-empty parts with a comma
data["Address"] = ", ".join([part for part in address_parts if part])
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
windows_text = windows_section.group(1)
window_data = extract_window_age_description(windows_text)
data.update(window_data)
return data
def extract_window_age_description(windows_text):
    """
    Extracts the most common window age description and its proportion.

    Each known glazing description is counted in ``windows_text``. The one
    seen most often is returned together with its share (in percent) of all
    counted descriptions. On a tie, the description listed first in
    ``window_descriptions`` wins.

    Parameters:
        windows_text (str): The text section containing window data.

    Returns:
        dict: A dictionary with the most common window age description
        ("Window Age Description") and its proportion
        ("Window Age Description Proportion (%)").

    Raises:
        ValueError: If none of the known descriptions occur in the text.
    """
    # Clean up windows_text by removing line breaks for better pattern matching
    windows_text = windows_text.replace("\n", "")

    # Define possible window age descriptions
    window_descriptions = [
        "Double post or during 2002",
        "Double pre 2002",
        "Double with unknown install date",
        "Secondary glazing",
        "Triple glazing",
        "Single glazing",
    ]

    # Count occurrences of each description
    description_counts = Counter()
    for description in window_descriptions:
        # Allow any amount of whitespace (including none) between the words.
        # A description wrapped across lines loses its separating space once
        # the newline is removed ("or\nduring" -> "orduring"), and CRLF or
        # doubled spaces would otherwise defeat an exact literal match.
        pattern = r"\s*".join(re.escape(word) for word in description.split())
        description_counts[description] = len(re.findall(pattern, windows_text))

    total_windows = sum(description_counts.values())
    if not total_windows:
        raise ValueError("Failed to extract window data.")

    # Determine the most common description and calculate its proportion
    most_common_description, window_count = description_counts.most_common(1)[0]
    window_proportion = window_count / total_windows * 100

    return {
        "Window Age Description": most_common_description,
        "Window Age Description Proportion (%)": window_proportion,
    }
def extract_epr(pdf_path):
"""
Extracts specific data from an Energy Report (EPR) PDF file.
@ -74,6 +124,8 @@ def extract_epr(pdf_path):
"Space Heating": None,
"Water Heating": None,
"Fuel Bill": None,
"Window Age Description": None,
"Window Age Description Proportion (%)": None,
}
with open(pdf_path, "rb") as file:
@ -115,6 +167,12 @@ def extract_epr(pdf_path):
fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
# Extract the windows data
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
windows_text = windows_section.group(1)
window_data = extract_window_age_description(windows_text)
data.update(window_data)
return data