extracting windows

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-28 08:42:53 +00:00
parent bcbb43ed8f
commit f141aa4d84

View file

@ -478,6 +478,59 @@ class ElmhurstSummaryReportExtractor:
def __init__(self, file_path):
self.file_path = file_path
@staticmethod
def extract_window_age_description(windows_text):
"""
Extracts the most common window age description and its proportion.
Parameters:
windows_text (str): The text section containing window data.
Returns:
dict: A dictionary with the most common window age description and its proportion.
"""
# Clean up windows_text by removing line breaks for better pattern matching
windows_text = windows_text.replace("\n", "")
# Define possible window age descriptions
window_descriptions = [
"Double post or during 2002",
"Double pre 2002",
"Double with unknown install date",
"Secondary glazing",
"Triple glazing",
"Single glazing",
]
# Count occurrences of each description
description_counts = Counter()
for description in window_descriptions:
matches = re.findall(re.escape(description), windows_text)
description_counts[description] = len(matches)
if not description_counts or not sum(description_counts.values()):
raise ValueError("Failed to extract window data.")
# Determine the most common description and calculate its proportion
most_common_description, window_count = description_counts.most_common(1)[0]
window_proportion = window_count / sum(description_counts.values()) * 100
# Get the second most common and the proportion
if window_proportion == 100:
second_most_common_description = None
second_most_common_proportion = 0
else:
second_most_common_description, second_window_count = description_counts.most_common(2)[1]
second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100
return {
"Window Age Description": most_common_description,
"Window Age Description Proportion (%)": window_proportion,
"Secondary Window Age Description": second_most_common_description,
"Secondary Window Age Description Proportion (%)": second_most_common_proportion,
"Number of Windows": sum(description_counts.values())
}
def extract(self):
"""
Extracts specific data from the provided PDF file.
@ -488,8 +541,7 @@ class ElmhurstSummaryReportExtractor:
"""
# Expected keys:
# dict_keys(['Total Number of Doors', 'Number of Insulated
# Doors', 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL', 'Windows',
# dict_keys(['Windows',
# 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory',
# 'Water Heating Code'])
@ -569,10 +621,15 @@ class ElmhurstSummaryReportExtractor:
raise ValueError("Could not extract number of insulated doors")
data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
# lighting
data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
windows_text = windows_section.group(1)
window_data = extract_window_age_description(windows_text)
data.update(window_data)
if not windows_section:
raise ValueError("Failed to extract window data.")
data["Windows"] = self.extract_window_age_description(windows_section.group(1))
# Extract heating system
# Extract Primary Heating Data
@ -636,10 +693,6 @@ class ElmhurstSummaryReportExtractor:
dimensions = extract_building_parts_summary(text)
data.update(dimensions)
data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
extracted_roof_data = extract_roof_details_summary(text)
main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0]
data["Main Roof Type"] = main_roof_data["Roof Type"]