mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
handling extraction of windows data
This commit is contained in:
parent
86ca5b4007
commit
8bf5b23410
2 changed files with 60 additions and 1 deletions
|
|
@ -61,7 +61,8 @@ def app():
|
|||
os_api_key="",
|
||||
property_type=None,
|
||||
fast=True,
|
||||
full_address=full_address
|
||||
full_address=full_address,
|
||||
max_retries=3
|
||||
)
|
||||
# Force the skipping of estimating the EPC
|
||||
searcher.ordnance_survey_client.property_type = None
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import PyPDF2
|
|||
import re
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from collections import Counter
|
||||
|
||||
FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
|
||||
|
||||
|
|
@ -19,6 +20,8 @@ def extract_summary_report(pdf_path):
|
|||
"Address": None,
|
||||
"Current SAP Rating": None,
|
||||
"Fuel Bill": None,
|
||||
"Window Age Description": None,
|
||||
"Window Age Description Proportion (%)": None,
|
||||
}
|
||||
|
||||
with open(pdf_path, "rb") as file:
|
||||
|
|
@ -61,9 +64,56 @@ def extract_summary_report(pdf_path):
|
|||
# Join non-empty parts with a comma
|
||||
data["Address"] = ", ".join([part for part in address_parts if part])
|
||||
|
||||
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
|
||||
windows_text = windows_section.group(1)
|
||||
window_data = extract_window_age_description(windows_text)
|
||||
data.update(window_data)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def extract_window_age_description(windows_text):
    """
    Extracts the most common window age description and its proportion.

    Parameters:
        windows_text (str): The text section containing window data.

    Returns:
        dict: A dictionary with two keys:
            "Window Age Description": the description that occurs most often
                (on a tie, the one listed first in window_descriptions wins).
            "Window Age Description Proportion (%)": that description's share
                of all matched descriptions, as a percentage (float).

    Raises:
        ValueError: If none of the known descriptions occur in windows_text.
    """
    # Clean up windows_text by removing line breaks for better pattern matching
    windows_text = windows_text.replace("\n", "")

    # Define possible window age descriptions
    window_descriptions = [
        "Double post or during 2002",
        "Double pre 2002",
        "Double with unknown install date",
        "Secondary glazing",
        "Triple glazing",
        "Single glazing",
    ]

    # Count occurrences of each description
    description_counts = Counter()
    for description in window_descriptions:
        # Removing line breaks can fuse two words together when a description
        # wraps across lines ("Double pre\n2002" -> "Double pre2002"), so allow
        # any amount of whitespace (including none) between the words instead
        # of requiring the exact single spaces of the literal description.
        pattern = r"\s*".join(re.escape(word) for word in description.split())
        description_counts[description] = len(re.findall(pattern, windows_text))

    total_matches = sum(description_counts.values())
    if not total_matches:
        raise ValueError("Failed to extract window data.")

    # Determine the most common description and calculate its proportion
    most_common_description, window_count = description_counts.most_common(1)[0]
    window_proportion = window_count / total_matches * 100

    return {
        "Window Age Description": most_common_description,
        "Window Age Description Proportion (%)": window_proportion,
    }
|
||||
|
||||
|
||||
def extract_epr(pdf_path):
|
||||
"""
|
||||
Extracts specific data from an Energy Report (EPR) PDF file.
|
||||
|
|
@ -74,6 +124,8 @@ def extract_epr(pdf_path):
|
|||
"Space Heating": None,
|
||||
"Water Heating": None,
|
||||
"Fuel Bill": None,
|
||||
"Window Age Description": None,
|
||||
"Window Age Description Proportion (%)": None,
|
||||
}
|
||||
|
||||
with open(pdf_path, "rb") as file:
|
||||
|
|
@ -115,6 +167,12 @@ def extract_epr(pdf_path):
|
|||
fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
|
||||
data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
|
||||
|
||||
# Extract the windows data
|
||||
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
|
||||
windows_text = windows_section.group(1)
|
||||
window_data = extract_window_age_description(windows_text)
|
||||
data.update(window_data)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue