From d489b4346fd6e1f940de4fb1f61ca6cd6b10cf24 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Thu, 28 Nov 2024 09:14:38 +0000
Subject: [PATCH] extracting secondary heating

Move the primary and secondary heating parsing out of extract() into two
static helpers, extract_primary_heating() and
extract_secondary_heating_details(). extract() now stores their results
as nested dicts under data["Primary Heating"] and
data["Secondary Heating"] instead of the flat
"Existing Primary/Secondary Heating ..." keys, and the secondary heating
code moves to data["Secondary Heating"]["Heating Code"].

extract_primary_heating() raises ValueError when no "Main Heating1"
section is found. The secondary helper writes the parsed controls to the
same "Controls" key that its defaults define, so the returned dict has
the same keys whether or not a "Main Heating2" section exists.
---
 utils/file_data_extraction.py | 116 ++++++++++++++++++----------------
 1 file changed, 61 insertions(+), 55 deletions(-)

diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py
index d444bff8..20590afd 100644
--- a/utils/file_data_extraction.py
+++ b/utils/file_data_extraction.py
@@ -531,6 +531,62 @@ class ElmhurstSummaryReportExtractor:
             "Number of Windows": sum(description_counts.values())
         }

+    @staticmethod
+    def extract_primary_heating(text):
+        primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
+        primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
+        primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
+        if primary_heating_section is None:
+            raise ValueError("Failed to extract primary heating data.")
+
+        primary_text = primary_heating_section.group(1)
+
+        output = {
+            'System': re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(1).strip(),
+            'PCDF Reference': re.search(r"PCDF boiler Reference\s*(\d+)", primary_text).group(1),
+            'Controls': re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(1).strip(),
+            '% of Heat': int(re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1))
+        }
+        return output
+
+    @staticmethod
+    def extract_secondary_heating_details(text):
+        secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
+
+        # Defaults
+        output = {
+            "System": "",
+            "PCDF Reference": "",
+            "Controls": "",
+            "% of Heat": 0,
+            "Heating Code": ""
+        }
+        if secondary_heating_section is not None:
+            # Overwrite defaults
+            secondary_text = secondary_heating_section.group(1)
+
+            main_heating_code_match_secondary = re.search(
+                r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
+            )
+            output["System"] = main_heating_code_match_secondary.group(1).strip()
+            output["PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1)
+
+            second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
+            output["Controls"] = (
+                second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
+            )
+            output["% of Heat"] = int(
+                re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
+            )
+
+        secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
+        if output["System"] != "":
+            output["Heating Code"] = (
+                secondary_heating_code_match.group(1).strip() if secondary_heating_code_match else ""
+            )
+
+        return output
+
     def extract(self):
         """
         Extracts specific data from the provided PDF file.
@@ -541,13 +597,11 @@ class ElmhurstSummaryReportExtractor:
         """

         # Expected keys:
-        # dict_keys(['Windows',
+        # dict_keys([
         # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory',
         # 'Water Heating Code'])

-        data = {
-
-        }
+        data = {}

         with (open(self.file_path, "rb") as file):
             reader = PyPDF2.PdfReader(file)
@@ -631,62 +685,14 @@ class ElmhurstSummaryReportExtractor:
                 raise ValueError("Failed to extract window data.")
             data["Windows"] = self.extract_window_age_description(windows_section.group(1))

-            # Extract heating system
-            # Extract Primary Heating Data
-            # Extract Primary Heating Section
-            primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
-            primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
-            primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
-
-            primary_text = primary_heating_section.group(1)
-
-            data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
-                1).strip()
-            data["Existing Primary Heating PCDF Reference"] = re.search(
-                r"PCDF boiler Reference\s*(\d+)", primary_text
-            ).group(1)
-            data["Existing Primary Heating Controls"] = re.search(
-                r"Main Heating Controls\s*(.*?)\n", primary_text
-            ).group(1).strip()
-            data["Existing Primary Heating % of Heat"] = int(
-                re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)
-            )
+            data["Primary Heating"] = self.extract_primary_heating(text)
+            data["Secondary Heating"] = self.extract_secondary_heating_details(text)

             # Extract Secondary Heating Section
-            secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
-
-            if secondary_heating_section is None:
-                data["Existing Secondary Heating System"] = ""
-                data["Existing Secondary Heating PCDF Reference"] = ""
-                data["Existing Secondary Heating Controls"] = ""
-                data["Existing Secondary Heating % of Heat"] = 0
-
-            else:
-                secondary_text = secondary_heating_section.group(1)
-
-                main_heating_code_match_secondary = re.search(
-                    r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
-                )
-                data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
-                data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
-                                                                              secondary_text).group(1)
-                second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
-                data["Existing Secondary Heating Controls"] = (
-                    second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
-                )
-                data["Existing Secondary Heating % of Heat"] = int(
-                    re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
-                )


             # Extract Secondary Heating and Water Heating Codes
-            secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
-            water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
-            if data["Existing Secondary Heating System"] == "":
-                data["Secondary Heating Code"] = ""
-            else:
-                data["Secondary Heating Code"] = secondary_heating_code_match.group(
-                    1).strip() if secondary_heating_code_match else ""
+            water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
             data["Water Heating Code"] = water_heating_code_match.group(1).strip()

