# Model/survey_report/extraction/quidos.py

import re


class SiteNotesExtractor:
    """
    Extracts SAP rating, carbon emissions, energy costs, building dimensions and wall
    details from the text of an EPC summary report.

    The extraction methods run by ``extract_all`` store their results in ``self.data``.
    ``extract_walls`` is not run by ``extract_all`` and returns its result directly.
    Every method raises ``ValueError`` when the text it looks for cannot be found.
    """

    # Patterns are compiled once at class level so each method reads as
    # "search -> validate -> store" without inline regex noise.
    _SAP_RATING_RE = re.compile(
        r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)"
    )
    _CARBON_EMISSIONS_RE = re.compile(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)")
    _BILLS_ESTIMATE_RE = re.compile(r"Current annual energy costs £\s*([\d,.]+)")

    # Captures everything between the dimensions table header and the "5.0 Conservatory" heading.
    _DIMENSIONS_SECTION_RE = re.compile(
        r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
        r"Party Wall "
        r"Length \(m\)\n"
        r"(.*?)\n5\.0 Conservatory",
        re.DOTALL,
    )
    # One row per building part (Main Property, Extension 1, Extension 2, ...) followed by four numbers.
    _BUILDING_PART_RE = re.compile(
        r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )

    # Captures everything between the "7.0 Walls" and "8.0 Roofs" headings.
    _WALLS_SECTION_RE = re.compile(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", re.DOTALL)
    # A building-part header followed by up to five optional rows, in this order.
    # The negative lookahead stops the plain "Insulation" row from swallowing an
    # "Insulation Thickness(mm)" row when the plain row is absent.
    _WALL_RE = re.compile(
        r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
        r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
        r"(?:Insulation(?! Thickness\(mm\))\s*(?P<insulation>[^\n]*)\n)?"
        r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
        r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
        r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?"
    )

    def __init__(self, pdf_text: str) -> None:
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.

        :param pdf_text: Text of the summary report, already extracted from the PDF.
        """
        self.text = pdf_text
        self.data = {}

    def extract_sap_rating(self) -> None:
        """
        Extracts the current and potential EPC band (A-G) and SAP rating from the report
        and stores them in ``self.data``.

        :raises ValueError: If the SAP rating text is not found.
        """
        match = self._SAP_RATING_RE.search(self.text)

        if not match:
            raise ValueError("No SAP rating found in the report")

        self.data.update({
            "Current EPC Band": match.group(1),
            "Current SAP Rating": int(match.group(2)),
            "Potential EPC Band": match.group(3),
            "Potential SAP Rating": int(match.group(4)),
        })

    def extract_carbon_emissions(self) -> None:
        """
        Extracts the current annual carbon emissions (TCO2) and stores them in ``self.data``.

        :raises ValueError: If the emissions text is not found.
        """
        match = self._CARBON_EMISSIONS_RE.search(self.text)

        if not match:
            raise ValueError("No carbon emissions found in the report")

        self.data.update({
            "Current Carbon Emissions (TCO2)": float(match.group(1)),
        })

    def extract_building_dimensions(self) -> None:
        """
        Extracts dimensions for each building part (Main Property and any extensions).

        Stores the per-part list under ``self.data["Building Dimensions"]`` and the summed
        floor area / heat loss area under ``self.data["Total Building Dimensions"]``.

        :raises ValueError: If the dimensions section or its rows cannot be found.
        """
        dimensions_section = self._DIMENSIONS_SECTION_RE.search(self.text)

        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")

        building_parts = []
        for match in self._BUILDING_PART_RE.finditer(dimensions_section.group(1)):
            room_height = float(match.group(3))
            loss_perimeter = float(match.group(4))
            building_parts.append({
                "Building Part": match.group(1).strip(),
                "Part Floor Area (m2)": float(match.group(2)),
                "Room Height (m)": room_height,
                "Loss Perimeter (m)": loss_perimeter,
                "Party Wall Length (m)": float(match.group(5)),
                # Heat loss area is derived: loss perimeter times room height
                "Heat Loss Area (m2)": loss_perimeter * room_height,
            })

        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        # Totals across all building parts
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self) -> None:
        """
        Extracts the estimated annual energy costs (£) from the report and stores them
        in ``self.data``. Thousands separators (commas) are removed before conversion.

        :raises ValueError: If the energy cost text is not found.
        """
        match = self._BILLS_ESTIMATE_RE.search(self.text)

        if not match:
            raise ValueError("No bills estimate found in the report")

        self.data["Estimated Annual Energy Cost (£)"] = float(match.group(1).replace(",", ""))

    def extract_all(self) -> dict:
        """
        Runs the SAP rating, carbon emissions, bills and building dimensions extractions
        and returns the dictionary with the extracted data.

        :return: ``self.data`` after all extractions have run.
        :raises ValueError: If any of the individual extractions fails.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()

        # TODO: Extract specific measures (not yet part of extract_all):
        # primary wall, secondary wall, roof, floor, heating system, hot water system,
        # windows, doors, lighting, ventilation, solar

        return self.data

    def extract_walls(self) -> list:
        """
        Extracts wall type, insulation, insulation thickness and wall thickness for each
        building part within the 7.0 Walls section of the summary PDF text.

        Missing or empty rows are reported as "Unknown" (``None`` for the numeric wall
        thickness). A "Main Property Alternative" header is reported under the building
        part "Main Property".

        :return: One dictionary per building-part header found in the walls section.
        :raises ValueError: If the walls section cannot be located.
        """
        section_match = self._WALLS_SECTION_RE.search(self.text)
        if not section_match:
            raise ValueError("Failed to locate the walls section in the text.")

        # The section pattern consumes the newline in front of "8.0 Roofs". Each row in
        # _WALL_RE (and the header) must end with "\n", so without restoring it the final
        # row of the last building part would silently be dropped.
        wall_section = section_match.group(1) + "\n"

        # TODO: We aren't effectively picking up alternative walls, so the
        # "Alternative ..." fields below are always None. An earlier draft searched after
        # each "Main Property Alternative" match for an "Alternative Wall Sheltered" header
        # followed by the same Construction / Insulation / thickness rows.
        wall_data = []
        for match in self._WALL_RE.finditer(wall_section):
            building_part = match.group("section")
            if "Main Property" in building_part:
                building_part = "Main Property"

            thickness = match.group("thickness")  # digits only (\d+), or None when the row is absent

            wall_data.append({
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            })

        return wall_data


class EPRExtractor:
    """
    Pulls the address and the space / water heating consumption figures out of the
    text of an Energy Performance Report (EPR).

    Results are collected in ``self.data``; each extraction raises ``ValueError``
    when the text it looks for is missing.
    """

    def __init__(self, pdf_text):
        """
        Keeps the already-extracted PDF text and starts with an empty result dictionary.
        """
        self.data = {}
        self.text = pdf_text

    def extract_heating_consumption(self):
        """
        Reads the space heating and water heating values (KWH) and stores them as
        integers, with thousands separators removed.
        """
        found = re.search(
            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
            self.text,
            flags=re.DOTALL,
        )
        if found is None:
            raise ValueError("No heating data found in the report")

        # Convert both figures before touching self.data so a bad value leaves it untouched.
        space_kwh, water_kwh = (int(raw.replace(",", "")) for raw in found.groups())
        self.data["Space Heating (KWH)"] = space_kwh
        self.data["Water Heating (KWH)"] = water_kwh

    def extract_address(self):
        """
        Reads the text between the "Address" label and the "Town" line and stores it,
        stripped of surrounding whitespace, under "Address".
        """
        found = re.search(r"Address\s*(.*?)\nTown\s*(.*?)\n", self.text, flags=re.DOTALL)
        if found is None:
            raise ValueError("No address found in the report")

        # Only the first capture (the address lines) is kept; the Town capture is not used.
        self.data["Address"] = found.group(1).strip()

    def extract_all(self):
        """
        Runs the address extraction followed by the heating extraction and returns
        the dictionary of collected values.
        """
        for step in (self.extract_address, self.extract_heating_consumption):
            step()
        return self.data