From f141aa4d842a38d8133bdf9b586224333f5372be Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Thu, 28 Nov 2024 08:42:53 +0000
Subject: [PATCH] extracting windows

---
 utils/file_data_extraction.py | 71 ++++++++++++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 9 deletions(-)

diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py
index 2337ea9d..d444bff8 100644
--- a/utils/file_data_extraction.py
+++ b/utils/file_data_extraction.py
@@ -478,6 +478,59 @@ class ElmhurstSummaryReportExtractor:
     def __init__(self, file_path):
         self.file_path = file_path
 
+    @staticmethod
+    def extract_window_age_description(windows_text):
+        """
+        Summarise which of the known glazing descriptions occur in the windows text.
+
+        Parameters:
+        windows_text (str): The text section containing window data.
+        Returns:
+        dict: The most common and second most common descriptions with their shares (%), plus "Number of Windows" (total matches).
+        Raises: ValueError if none of the known descriptions is found in windows_text.
+        """
+        # Strip newlines (replaced with nothing, not a space) so line breaks do not interrupt the literal matches below
+        windows_text = windows_text.replace("\n", "")
+
+        # Fixed set of glazing descriptions that are searched for; any other text is ignored
+        window_descriptions = [
+            "Double post or during 2002",
+            "Double pre 2002",
+            "Double with unknown install date",
+            "Secondary glazing",
+            "Triple glazing",
+            "Single glazing",
+        ]
+
+        # Count literal (regex-escaped) occurrences; every description gets an entry, even when its count is 0
+        description_counts = Counter()
+        for description in window_descriptions:
+            matches = re.findall(re.escape(description), windows_text)
+            description_counts[description] = len(matches)
+
+        if not description_counts or not sum(description_counts.values()):
+            raise ValueError("Failed to extract window data.")
+
+        # Most frequent description and its share of all matches as a percentage (ties keep the list order above)
+        most_common_description, window_count = description_counts.most_common(1)[0]
+        window_proportion = window_count / sum(description_counts.values()) * 100
+
+        # Runner-up description and its share; none is reported when the top description accounts for every match
+        if window_proportion == 100:
+            second_most_common_description = None
+            second_most_common_proportion = 0
+        else:
+            second_most_common_description, second_window_count = description_counts.most_common(2)[1]
+            second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100
+
+        return {
+            "Window Age Description": most_common_description,
+            "Window Age Description Proportion (%)": window_proportion,
+            "Secondary Window Age Description": second_most_common_description,
+            "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
+            "Number of Windows": sum(description_counts.values())
+        }
+
     def extract(self):
         """
         Extracts specific data from the provided PDF file.
@@ -488,8 +541,7 @@ class ElmhurstSummaryReportExtractor:
         """
 
         # Expected keys:
-        # dict_keys(['Total Number of Doors', 'Number of Insulated
-        # Doors', 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL', 'Windows',
+        # dict_keys(['Windows',
         # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory',
         # 'Water Heating Code'])
 
@@ -569,10 +621,15 @@ class ElmhurstSummaryReportExtractor:
             raise ValueError("Could not extract number of insulated doors")
         data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
 
+        # Lighting counts. NOTE(review): re.search returns None when a label is absent, so .group(1) would raise AttributeError
+        data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
+        data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
+        data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
+
         windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
-        windows_text = windows_section.group(1)
-        window_data = extract_window_age_description(windows_text)
-        data.update(window_data)
+        if not windows_section:
+            raise ValueError("Failed to extract window data.")
+        data["Windows"] = self.extract_window_age_description(windows_section.group(1))
 
         # Extract heating system
         # Extract Primary Heating Data
@@ -636,10 +693,6 @@ class ElmhurstSummaryReportExtractor:
         dimensions = extract_building_parts_summary(text)
         data.update(dimensions)
 
-        data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
-        data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
-        data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
-
         extracted_roof_data = extract_roof_details_summary(text)
         main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0]
         data["Main Roof Type"] = main_roof_data["Roof Type"]