@staticmethod
def extract_conservatory(text):
    """
    Extracts conservatory data from the provided text.
    The section is located between "5.0 Conservatory:" and "7.0 Walls:".

    Args:
        text (str): The full text of the Summary Report PDF.

    Returns:
        dict: A dictionary with conservatory details:
            - "Conservatory Present": "Yes" or "No"
            - "Conservatory Separated": "Yes", "No" or "" when not found
            - "Conservatory Floor Area": float, or 0 when not found
            - "Conservatory Double Glazed": "Yes", "No" or "" when not found
            - "Conservatory Glazed Perimeter": float, or 0 when not found
            - "Heated Conservatory Height": rest of the "Room Height" line,
              or "" when not found

    Raises:
        ValueError: If the conservatory section cannot be located in the text.
    """
    # A number with an optional fractional part. Unlike a bare "[\d.]+" this
    # never captures tokens such as "12.5." or "." that float() would reject.
    number = r"(\d+(?:\.\d*)?|\.\d+)"

    # Extract the section between "5.0 Conservatory" and "7.0 Walls"
    section_match = re.search(r"5\.0 Conservatory:(.*?)7\.0 Walls:", text, re.DOTALL)
    if not section_match:
        logger.error("Failed to extract conservatory data.")
        raise ValueError("Could not extract conservatory data.")

    conservatory_text = section_match.group(1)

    def find_yes_no(pattern):
        # Return the captured answer normalised to "Yes"/"No", or "" if absent.
        match = re.search(pattern, conservatory_text, re.IGNORECASE)
        return match.group(1).strip().capitalize() if match else ""

    def find_number(pattern):
        # Return the captured value as a float, or 0 if the field is absent.
        match = re.search(pattern, conservatory_text, re.IGNORECASE)
        return float(match.group(1)) if match else 0

    # A missing answer is treated the same as an explicit "No".
    if find_yes_no(r"Is there a conservatory\?\s*(Yes|No)") != "Yes":
        return {
            "Conservatory Present": "No",
            "Conservatory Separated": "",
            "Conservatory Floor Area": 0,
            "Conservatory Double Glazed": "",
            "Conservatory Glazed Perimeter": 0,
            "Heated Conservatory Height": "",
        }

    # NOTE: no report with a conservatory had been seen when this was written,
    # so the field patterns below still need checking against a real example.
    height_match = re.search(r"Room Height\s*(.*?)(?=\n|$)", conservatory_text, re.IGNORECASE)

    return {
        "Conservatory Present": "Yes",
        "Conservatory Separated": find_yes_no(r"Is it thermally separated\?\s*(Yes|No)"),
        "Conservatory Floor Area": find_number(r"Floor Area \[m2\]\s*" + number),
        "Conservatory Double Glazed": find_yes_no(r"Double Glazed\s*(Yes|No)"),
        "Conservatory Glazed Perimeter": find_number(r"Glazed Perimeter \[m\]\s*" + number),
        "Heated Conservatory Height": height_match.group(1).strip() if height_match else "",
    }
main_building_walls["Wall Thickness (mm)"] - data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] - data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] - data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] - data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] - return data