import re


class SiteNotesExtractor:
    """
    Extracts SAP rating, carbon emissions, energy cost estimate, building
    dimensions and wall details from the text of an EPC summary report.

    Each ``extract_*`` method searches ``self.text`` with regular expressions.
    All of them except ``extract_walls`` store their results in ``self.data``;
    ``extract_walls`` returns its results instead.
    """

    def __init__(self, pdf_text: str) -> None:
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.

        :param pdf_text: Text previously extracted from the report PDF.
        """
        self.text = pdf_text
        self.data = {}

    def extract_sap_rating(self) -> None:
        """
        Extracts the current and potential SAP rating from the report.

        Stores the EPC band letter (A-G) and the integer SAP score for both
        the current and the potential rating in ``self.data``.

        :raises ValueError: If the rating line cannot be found.
        """
        match = re.search(
            r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)",
            self.text,
        )
        if not match:
            raise ValueError("No SAP rating found in the report")

        self.data.update({
            "Current EPC Band": match.group(1),
            "Current SAP Rating": int(match.group(2)),
            "Potential EPC Band": match.group(3),
            "Potential SAP Rating": int(match.group(4)),
        })

    def extract_carbon_emissions(self) -> None:
        """
        Extracts the current annual carbon emissions (TCO2).

        Only the "Current annual emissions" figure is read; it is stored in
        ``self.data`` as a float.

        :raises ValueError: If the emissions line cannot be found.
        """
        match = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)
        if not match:
            raise ValueError("No carbon emissions found in the report")

        self.data.update({
            "Current Carbon Emissions (TCO2)": float(match.group(1)),
        })

    def extract_building_dimensions(self) -> None:
        """
        Extracts dimensions for each building part and stores them in a list.
        Handles Main Property and multiple extensions.

        Stores the per-part rows under ``"Building Dimensions"`` and the summed
        floor area / heat loss area under ``"Total Building Dimensions"`` in
        ``self.data``.

        :raises ValueError: If the dimensions table cannot be located, or if
            it contains no recognisable building part rows.
        """
        # Locate the Dimensions section: everything between the table header
        # and the "5.0 Conservatory" heading.
        dimensions_section = re.search(
            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
            r"Party Wall "
            r"Length \(m\)\n"
            r"(.*?)\n5\.0 Conservatory",
            self.text,
            re.DOTALL,
        )
        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")

        dimensions_text = dimensions_section.group(1)

        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
        # followed by its four numeric columns.
        building_part_pattern = re.compile(
            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )

        building_parts = []
        for match in building_part_pattern.finditer(dimensions_text):
            part = {
                "Building Part": match.group(1).strip(),
                "Part Floor Area (m2)": float(match.group(2)),
                "Room Height (m)": float(match.group(3)),
                "Loss Perimeter (m)": float(match.group(4)),
                "Party Wall Length (m)": float(match.group(5)),
            }
            # Heat loss area is the loss perimeter multiplied by the room height.
            part["Heat Loss Area (m2)"] = part["Loss Perimeter (m)"] * part["Room Height (m)"]
            building_parts.append(part)

        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        # Totals across all building parts.
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self) -> None:
        """
        Extracts the estimated annual energy costs (£) from the report.

        Thousands separators (commas) are removed before the value is stored
        in ``self.data`` as a float.

        :raises ValueError: If the energy cost line cannot be found.
        """
        match = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)
        if not match:
            raise ValueError("No bills estimate found in the report")

        self.data["Estimated Annual Energy Cost (£)"] = float(match.group(1).replace(",", ""))

    def extract_all(self) -> dict:
        """
        Runs all extraction methods and returns a dictionary with extracted data.

        Runs the SAP rating, carbon emissions, bills estimate and building
        dimensions extractors. ``extract_walls`` is not called here.

        :return: ``self.data`` after it has been populated.
        :raises ValueError: If any of the extraction methods fails.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()

        # Extract specific measures
        # Primary wall
        # Secondary wall
        # Roof
        # Floor
        # Heating system
        # Hot water system
        # Windows
        # Doors
        # Lighting
        # Ventilation
        # Solar

        return self.data

    def extract_walls(self) -> list:
        """
        Extracts wall type, insulation, dry-lining, and thickness for each building part,
        including any alternative wall details within the 7.0 Walls section of the summary PDF text.

        A "Main Property Alternative" heading is reported with the building
        part name "Main Property". The ``"Alternative ..."`` keys of every
        entry are currently always ``None`` (see the TODO below).

        :return: One dict per building part heading found in the section.
            Missing text fields are reported as ``"Unknown"`` and a missing
            wall thickness as ``None``.
        :raises ValueError: If the walls section cannot be located.
        """
        text = self.text
        wall_data = []

        # Isolate the 7.0 Walls section
        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", text, re.DOTALL)
        if not wall_section_match:
            raise ValueError("Failed to locate the walls section in the text.")

        # The section regex consumes the newline before "8.0 Roofs", but every
        # field pattern below requires a trailing newline; put it back so the
        # final field of the section is not silently dropped.
        wall_section = wall_section_match.group(1) + "\n"

        # Define patterns to match walls for each building part. Every field
        # line is optional; the group names are the ones read back below.
        wall_pattern = re.compile(
            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
            r"(?:Insulation\s*(?P<insulation>[^\n]*)\n)?"
            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
            re.MULTILINE,
        )

        # TODO: We aren't effectively picking up alternative walls
        # alt_wall_pattern = re.compile(
        #     r"Alternative Wall Sheltered\s*.*?\n"
        #     r".*?Construction\s*(?P<alt_construction>[^\n]*)\n"
        #     r"Insulation\s*(?P<alt_insulation>[^\n]*)\n"
        #     r"Insulation Thickness\(mm\)\s*(?P<alt_insulation_thickness>[^\n]*)\n"
        #     r"Wall Thickness Measured\?\s*(?P<alt_thickness_measured>[^\n]*)\n"
        #     r"Wall Thickness\(mm\)\s*(?P<alt_thickness>\d+)?",
        #     re.MULTILINE
        # )

        for match in wall_pattern.finditer(wall_section):
            building_part = match.group("section")
            # has_alternative_wall = "Alternative" in building_part
            # Collapse "Main Property Alternative" into "Main Property".
            if "Main Property" in building_part:
                building_part = "Main Property"

            thickness = match.group("thickness")

            wall_entry = {
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                # The group only ever matches digits, so int() is safe here.
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            }

            # Check if an alternative wall section exists
            # if has_alternative_wall:
            #     alt_match = alt_wall_pattern.search(wall_section, match.end())
            #     if alt_match:
            #         wall_entry["Alternative Wall Type"] = alt_match.group("alt_construction") or "Unknown"
            #         wall_entry["Alternative Wall Insulation"] = alt_match.group("alt_insulation") or "Unknown"
            #         wall_entry["Alternative Insulation Thickness (mm)"] = alt_match.group(
            #             "alt_insulation_thickness") or "Unknown"
            #         wall_entry["Alternative Wall Thickness Measured"] = alt_match.group(
            #             "alt_thickness_measured") or "Unknown"
            #         wall_entry["Alternative Wall Thickness (mm)"] = int(
            #             alt_match.group("alt_thickness")) if alt_match.group("alt_thickness") and alt_match.group(
            #             "alt_thickness").isdigit() else None

            wall_data.append(wall_entry)

        return wall_data


class EPRExtractor:
    """
    Extracts space heating, water heating, and address from an Energy Performance Report (EPR).

    Results are accumulated in ``self.data``.
    """

    def __init__(self, pdf_text: str) -> None:
        """
        Initializes the EPRExtractor with the extracted PDF text.

        :param pdf_text: Text previously extracted from the report PDF.
        """
        self.text = pdf_text
        self.data = {}

    def extract_heating_consumption(self) -> None:
        """
        Extracts space heating and water heating values from the report.

        Thousands separators (commas) are removed and both values are stored
        in ``self.data`` as integers.

        :raises ValueError: If the heating lines cannot be found.
        """
        match = re.search(
            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
            self.text,
            re.DOTALL,
        )
        if not match:
            raise ValueError("No heating data found in the report")

        self.data.update({
            "Space Heating (KWH)": int(match.group(1).replace(",", "")),
            "Water Heating (KWH)": int(match.group(2).replace(",", "")),
        })

    def extract_address(self) -> None:
        """
        Extracts the address from the report.

        The stored value is the text between the "Address" label and the
        following "Town" line, stripped of surrounding whitespace. The town
        itself is matched but not stored.

        :raises ValueError: If the address block cannot be found.
        """
        match = re.search(
            r"Address\s*(.*?)\nTown\s*(.*?)\n",
            self.text,
            re.DOTALL,
        )
        if not match:
            raise ValueError("No address found in the report")

        self.data["Address"] = match.group(1).strip()

    def extract_all(self) -> dict:
        """
        Runs all extraction methods and returns a dictionary with extracted data.

        :return: ``self.data`` after it has been populated.
        :raises ValueError: If any of the extraction methods fails.
        """
        self.extract_address()
        self.extract_heating_consumption()
        return self.data