Model/epr_data_exports/app.py

"""
This is a placeholder script to extract EPR data from summary report files, where possible.
"""

"""
July 2025 LiveWest Heating Upgrades
"""
import os
import re
import PyPDF2
import pandas as pd
from tqdm import tqdm
from collections import Counter


def extract_window_age_description(windows_text):
    """
    Extracts the most common window age description and its proportion.

    Line breaks are stripped from the text before matching, and the whitespace
    between the words of each description is treated as optional. A description
    that was wrapped across two lines is therefore counted whether or not the
    wrapped line kept its trailing space (e.g. "Double pre\\n2002").

    Parameters:
        windows_text (str): The text section containing window data.

    Returns:
        dict: The most common description and its share of all matched
            descriptions (in percent), the runner-up description and its share
            (None / 0 when every match is the same description), and the total
            number of matched descriptions under "Number of Windows".

    Raises:
        ValueError: If none of the known descriptions occurs in the text.
    """
    # Remove line breaks so a description wrapped over two lines becomes contiguous
    windows_text = windows_text.replace("\n", "")

    # Known window age descriptions. On equal counts, the earlier entry wins
    # because Counter.most_common keeps insertion order for ties.
    window_descriptions = [
        "Double post or during 2002",
        "Double pre 2002",
        "Double with unknown install date",
        "Secondary glazing",
        "Triple glazing",
        "Single glazing",
        "Double between 2002 and 2021",
    ]

    # Count occurrences of each description. Whitespace between words is optional
    # because stripping a line break can fuse two words together.
    description_counts = Counter()
    for description in window_descriptions:
        pattern = r"\s*".join(re.escape(word) for word in description.split(" "))
        description_counts[description] = len(re.findall(pattern, windows_text))

    total_windows = sum(description_counts.values())
    if not total_windows:
        raise ValueError("Failed to extract window data.")

    # Determine the two most common descriptions and their proportions
    ranked = description_counts.most_common(2)
    most_common_description, window_count = ranked[0]
    window_proportion = window_count / total_windows * 100

    if window_count == total_windows:
        # Every match is the same description, so there is no runner-up
        second_most_common_description = None
        second_most_common_proportion = 0
    else:
        second_most_common_description, second_window_count = ranked[1]
        second_most_common_proportion = second_window_count / total_windows * 100

    return {
        "Window Age Description": most_common_description,
        "Window Age Description Proportion (%)": window_proportion,
        "Secondary Window Age Description": second_most_common_description,
        "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
        "Number of Windows": total_windows,
    }


def extract_building_parts_summary(text):
    """
    Parses the Dimensions section of the summary report text and returns
    aggregated dimensions.

    Each building part (Main Property and any numbered extensions) is scanned for
    per-floor rows (area, room height, perimeter, party wall length) and for an
    optional "Room(s) in Roof" area. The per-floor rows are then summed into totals.

    Parameters:
        text (str): Full text of the summary report.

    Returns:
        dict: Total floor area, total lowest-floor area, room-in-roof area, and the
            wall area (perimeter x room height) of the main property and of the
            first extension.

    Raises:
        ValueError: If neither an Internal nor an External dimensions section is found.
    """
    # Internal dimensions are tried first; External is the fallback
    section_patterns = (
        r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:",
        r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:",
    )
    section_match = None
    for section_pattern in section_patterns:
        section_match = re.search(section_pattern, text, re.DOTALL)
        if section_match is not None:
            break
    if section_match is None:
        raise ValueError("Failed to locate dimensions section in the text.")
    section_body = section_match.group(1)

    # One match per building part: the label, then everything up to the next label
    part_regex = re.compile(
        r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)",
        re.DOTALL
    )
    # Floor row: level label followed by area, room height, perimeter, party wall length
    storey_regex = re.compile(
        r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )
    # Room-in-roof rows only carry a floor area
    roof_room_regex = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)")

    rows = []
    for part in part_regex.finditer(section_body):
        label, body = part.group(1), part.group(2)

        for storey in storey_regex.finditer(body):
            level, area, height, perimeter, party_wall = storey.groups()
            rows.append({
                "Building Part": label,
                "Floor Level": level,
                "Floor Area (m2)": float(area),
                "Room Height (m)": float(height),
                "Perimeter (m)": float(perimeter),
                "Party Wall Length (m)": float(party_wall)
            })

        roof_room = roof_room_regex.search(body)
        if roof_room is not None:
            rows.append({
                "Building Part": label,
                "Floor Level": "Room in Roof",
                "Floor Area (m2)": float(roof_room.group(1)),
                "Room Height (m)": None,  # Not present on room-in-roof rows
                "Perimeter (m)": None,  # Not present on room-in-roof rows
                "Party Wall Length (m)": None  # Not present on room-in-roof rows
            })

    def floor_area_for(level_label):
        # Sum of floor areas over rows whose floor level contains the label
        return sum(row["Floor Area (m2)"] for row in rows if level_label in row["Floor Level"])

    def wall_area_for(part_label):
        # Perimeter x height, skipping rows where either value is missing or zero
        total = 0
        for row in rows:
            if part_label not in row["Building Part"]:
                continue
            if row["Perimeter (m)"] and row["Room Height (m)"]:
                total += row["Perimeter (m)"] * row["Room Height (m)"]
        return total

    return {
        "Total Floor Area (m2)": sum(row["Floor Area (m2)"] for row in rows),
        "Total Ground Floor Area (m2)": floor_area_for("Lowest Floor"),
        "RIR Floor Area": floor_area_for("Room in Roof"),
        "Main Building Wall Area (m2)": wall_area_for("Main Property"),
        "First Extension Wall Area (m2)": wall_area_for("1st Extension"),
    }


def extract_roof_details_summary(text):
    """
    Extracts roof type, insulation, and insulation thickness for each building part
    in the 8.0 Roofs section of the summary report.

    Parameters:
        text (str): Full text of the summary report.

    Returns:
        list[dict]: One dict per building part with the keys "Building Part",
            "Roof Type", "Roof Insulation" and "Roof Insulation Thickness".
            The two insulation values are None when the field is absent.
            An empty list is returned when there is no 8.0 Roofs section.
    """
    # Define data structure to hold results
    roof_data = []

    # Locate the entire 8.0 Roofs section
    roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL)
    if not roof_section_match:
        return roof_data  # Return empty if no roof section is found

    # Extract the roof section and append "9.0 Floors:" as the boundary
    roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:"

    # A field value ends at the next line that starts a new field, a new label or the
    # section boundary. Extension labels start with a digit ("1st Extension"), so
    # they need their own stop condition. With only [A-Z] as the generic stop, the
    # label was swallowed into the preceding value and the extension's entry was lost.
    extension_label = r"\d+(?:st|nd|rd|th) Extension"

    # Define pattern to match each building part's roof entry
    building_part_pattern = re.compile(
        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Building part label
        r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]|" + extension_label + r"))"  # Roof Type
        r"(?:\nInsulation\s+(.*?)"
        r"(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z]|" + extension_label + r")))?"  # Optional Insulation
        r"(?:\nInsulation Thickness\s+(.*?)"
        r"(?=\n(?:9\.0 Floors:|[A-Z]|" + extension_label + r")))?",  # Optional Insulation Thickness
        re.DOTALL
    )

    # Extract each building part's data
    for match in building_part_pattern.finditer(roof_section):
        part_name = match.group(1).strip()  # Building part label
        roof_type = match.group(2).strip()  # Roof Type
        roof_insulation = match.group(3).strip() if match.group(3) else None  # Optional Insulation
        roof_insulation_thickness = match.group(4).strip() if match.group(4) else None  # Optional Thickness

        # Kept as a safety net for values such as
        # 'A Another dwelling above\n<trailing text>'
        if roof_type.startswith("A Another dwelling above"):
            roof_type = "A Another dwelling above"

        # Store results for this building part
        roof_data.append({
            "Building Part": part_name,
            "Roof Type": roof_type,
            "Roof Insulation": roof_insulation,
            "Roof Insulation Thickness": roof_insulation_thickness,
        })

    return roof_data


def extract_wall_details_summary(text):
    """
    Extracts wall type and insulation for each building part, together with any
    alternative wall details, from the 7.0 Walls section of the summary PDF text.

    Parameters:
        text (str): Full text of the summary report.

    Returns:
        list[dict]: One dict per building part with the keys "Building Part",
            "Wall Type", "Wall Insulation" and the "Alternative Wall ..." keys.
            Alternative values stay at None / "N/A" when the part has no
            alternative wall block of its own.

    Raises:
        ValueError: If the 7.0 Walls section cannot be located.
    """
    # Locate the entire 7.0 Walls section (everything up to the 8.0 Roofs heading)
    wall_section_match = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL)
    if wall_section_match is None:
        raise ValueError("Failed to locate walls section in the text.")
    wall_section = wall_section_match.group(1)

    # Define pattern to match each building part's wall entry within the section
    building_part_pattern = re.compile(
        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Matches each building part label
        r"Type\s+(.*?)\n"  # Matches main wall Type
        r"Insulation\s+(.*?)\n",  # Matches main wall Insulation
        re.DOTALL
    )

    # Define pattern to capture alternative wall details, if present
    alternative_wall_pattern = re.compile(
        r"Alternative Wall Area.*?\n"  # Matches start of alternative wall section
        r"Alternative Type\s+(.*?)\n"  # Matches alternative wall Type
        r"Alternative Insulation\s+(.*?)\n"  # Matches alternative wall Insulation
        r"(Alternative Dry-lining\s+(.*?)\n)?"  # Optional Alternative Dry-lining
        r"Alternative Wall Thickness Unknown\s+(.*?)\n"  # Matches alternative wall Thickness Unknown
        r"Alternative Wall Thickness\s+(\d+)",  # Matches alternative wall Thickness
        re.DOTALL
    )

    # Materialise the matches so each part can see where the next one starts
    part_matches = list(building_part_pattern.finditer(wall_section))

    wall_data = []
    for index, match in enumerate(part_matches):
        wall_entry = {
            "Building Part": match.group(1).strip(),
            "Wall Type": match.group(2).strip(),
            "Wall Insulation": match.group(3).strip(),
            "Alternative Wall Type": None,
            "Alternative Wall Insulation": None,
            "Alternative Wall Dry-lining": "N/A",
            "Alternative Wall Thickness Unknown": None,
            "Alternative Wall Thickness (mm)": None,
        }

        # Only look for an alternative wall between this entry and the next part's
        # label. Otherwise a later part's alternative wall would be attributed to
        # an earlier part that has none.
        if index + 1 < len(part_matches):
            following = part_matches[index + 1]
            # NOTE: the catch-all label alternative ([\w\s]+) can absorb lines that
            # precede the real label, so the label is taken to start on the last
            # line of the captured group.
            search_end = following.start(1) + following.group(1).rfind("\n") + 1
        else:
            search_end = len(wall_section)

        alt_match = alternative_wall_pattern.search(wall_section, match.end(), search_end)
        if alt_match:
            wall_entry["Alternative Wall Type"] = alt_match.group(1).strip()
            wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip()
            wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A"
            wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip()
            wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6))

        # Append each building part as a dictionary in the wall_data list
        wall_data.append(wall_entry)

    return wall_data


def _read_pdf_text(pdf_path):
    """Returns the concatenated text of every page of the PDF at pdf_path."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page for which no text is returned
        return "".join(page.extract_text() or "" for page in reader.pages)


def extract_summary_report(pdf_path):
    """
    Extracts specific data from the provided summary report PDF file.

    Data includes:
    - Current SAP rating, property type, age band, storeys and room counts
    - Fuel Bill
    - Address and postcode
    - Door counts
    - Primary / secondary heating systems, controls and share of heat
    - Secondary heating and water heating codes, hot water system and fuel
    - Aggregated dimensions
    - Main building roof and wall details

    Parameters:
        pdf_path (str): Path of the summary report PDF.

    Returns:
        dict: The extracted fields. Keys that are declared below but not populated
            by this function remain None.

    Raises:
        AttributeError: If a required field's pattern is not found in the text
            (the regex match is None).
        IndexError: If no roof or wall entry has "Main" in its building part label.
        Any error raised by the dimension, roof and wall helpers propagates.
    """

    data = {
        "Address": None,
        "Postcode": None,
        "Current SAP Rating": None,
        "Current EPC Band": None,
        "Fuel Bill": None,
        "Main Building Age Band": None,
        "Number of Storeys": None,
        "Window Age Description": None,
        "Window Age Description Proportion (%)": None,
        "Secondary Window Age Description": None,
        "Secondary Window Age Description Proportion (%)": None,
        "Number of Windows": None,
        "Total Number of Doors": None,
        "Number of Insulated Doors": None,
        "Existing Primary Heating System": None,
        "Existing Primary Heating PCDF Reference": None,
        "Existing Primary Heating Controls": None,
        "Existing Primary Heating % of Heat": None,
        "Existing Secondary Heating System": None,
        "Existing Secondary Heating PCDF Reference": None,
        "Existing Secondary Heating Controls": None,
        "Existing Secondary Heating % of Heat": None,
        "Secondary Heating Code": None,
        "Water Heating Code": None,
        'Total Floor Area (m2)': None,
        'Total Ground Floor Area (m2)': None,
        'RIR Floor Area': None,
        'Main Building Wall Area (m2)': None,
        'First Extension Wall Area (m2)': None,
        "Number of Light Fittings": None,
        "Number of LEL Fittings": None,
        "Number of fittings needing LEL": None,
        "Main Roof Type": None,
        "Main Roof Insulation": None,
        "Main Roof Insulation Thickness": None,
        "Main Wall Type": None,
        "Main Wall Insulation": None,
        "Main Wall Dry-lining": None,
        "Main Wall Thickness": None,
        "Main Building Alternative Wall Type": None,
        "Main Building Alternative Wall Insulation": None,
        "Main Building Alternative Wall Dry-lining": None,
        "Main Building Alternative Wall Thickness": None,
    }

    # The file is only needed while the page text is read; all parsing below
    # works on the in-memory string.
    text = _read_pdf_text(pdf_path)

    # Extract Current SAP rating (matched as "<letter> <number>"; the number is kept)
    sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
    data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]

    data["Property Type"] = (
        re.search(r"Property type:\s*(.*?)\n2\.0", text, re.DOTALL)
        .group(1).replace('\n', ' ').strip().replace("  ", " ")
    )

    # Extract age
    age_band_match = re.search(
        r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
        text
    )
    data["Main Building Age Band"] = age_band_match.group(1)

    # Number of storeys
    storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
    data["Number of Storeys"] = int(storeys_match.group(1))

    # Grab number of heated rooms, number of habitable rooms.
    # The lookbehind stops "Habitable Rooms:" from matching inside
    # "Heated Habitable Rooms:", which would return the heated count whenever
    # that line comes first in the text.
    data["Number of Heated Rooms"] = int(re.search(r"Heated Habitable Rooms:\s*(\d+)", text).group(1))
    data["Number of Habitable rooms"] = int(
        re.search(r"(?<!Heated\s)Habitable Rooms:\s*(\d+)", text).group(1)
    )

    # Extract Carbon Emissions
    # carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text)
    # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1))

    # Extract Fuel Bill
    fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
    data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

    # Extract individual address components; each value is bounded by the next label
    postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
    # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
    house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
    house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
    street = re.search(r"Street:\s*(.*?)\nLocality:", text)
    locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
    town = re.search(r"Town:\s*(.*?)\nCounty:", text)
    county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)

    # Clean extracted values; components that were not found are skipped
    address_matches = [house_no, house_name, street, locality, town, county, postcode]
    address_parts = [found.group(1).strip() for found in address_matches if found]

    # Join non-empty parts with a comma
    data["Address"] = ", ".join([part for part in address_parts if part])
    # Postcode is treated as optional, consistently with the address parts above
    data["Postcode"] = postcode.group(1).strip() if postcode else None

    # windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
    # windows_text = windows_section.group(1)
    # window_data = extract_window_age_description(windows_text)
    # data.update(window_data)

    # Extract Total Number of Doors
    total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text)
    data["Total Number of Doors"] = int(total_doors_match.group(1))

    # Extract Number of Insulated Doors
    insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text)
    data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))

    # Extract Primary Heating Section: it ends at "Main Heating2" when a second
    # main heating system is listed, otherwise at "Water Heating"
    primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
    primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
    primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2

    primary_text = primary_heating_section.group(1)

    # Handle extracting main heating code, trying the known label variants in order
    mainheat_search = re.search(r"Main Heating Code\s*(.*?)\n", primary_text)
    if mainheat_search is None:
        mainheat_search = re.search(r"Main Heating EES Code\s*(.*?)\n", primary_text)
    if mainheat_search is None:
        mainheat_search = re.search(r"PCDF boiler Reference\s*(.*?)\n", primary_text)

    data["Existing Primary Heating System"] = mainheat_search.group(1).strip()

    data["Existing Primary Heating PCDF Reference"] = re.search(
        r"PCDF boiler Reference\s*(\d+)", primary_text
    ).group(1)

    controls_search = re.search(
        r"Main Heating Controls Sap\s*(.*?)\n", primary_text
    )
    if controls_search is None:
        controls_search = re.search(
            r"Main Heating Controls\s*(.*?)\n", primary_text
        )
    data["Existing Primary Heating Controls"] = controls_search.group(1).strip()
    data["Existing Primary Heating % of Heat"] = int(
        re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)
    )

    # Extract Secondary Heating Section
    secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)

    if secondary_heating_section is None:
        data["Existing Secondary Heating System"] = ""
        data["Existing Secondary Heating PCDF Reference"] = ""
        data["Existing Secondary Heating Controls"] = ""
        data["Existing Secondary Heating % of Heat"] = 0

    else:
        secondary_text = secondary_heating_section.group(1)

        main_heating_code_match_secondary = re.search(
            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
        )
        if main_heating_code_match_secondary is None:
            main_heating_code_match_secondary = re.search(
                r"Main Heating EES Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
            )

        data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
        data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
                                                                      secondary_text).group(1)
        second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
        data["Existing Secondary Heating Controls"] = (
            second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
        )
        data["Existing Secondary Heating % of Heat"] = int(
            re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
        )

    # Extract Secondary Heating and Water Heating Codes
    secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
    water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)

    # The secondary heating code is only reported when a second main heating system exists
    if data["Existing Secondary Heating System"] == "":
        data["Secondary Heating Code"] = ""
    else:
        data["Secondary Heating Code"] = secondary_heating_code_match.group(
            1).strip() if secondary_heating_code_match else ""

    data["Water Heating Code"] = water_heating_code_match.group(1).strip()

    dimensions = extract_building_parts_summary(text)
    data.update(dimensions)

    # Need to get the hot water: the text between the 15.0 heading line and 15.1
    section_match = re.search(r"15\.0.*?\n(.*?)15\.1", text, re.DOTALL)
    section_text = section_match.group(1)

    # Extract Water Heating Code (first token only) and the optional fuel type
    code_match = re.search(r"Water Heating Code\s+(\S+)", section_text)
    fuel_match = re.search(r"Water Heating Fuel Type\s+(.+)", section_text)
    if fuel_match is None:
        fuel_type = None
    else:
        fuel_type = fuel_match.group(1).strip()

    code = code_match.group(1)
    data["Hot Water System"] = code
    data["Hot Water Fuel"] = fuel_type

    # data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
    # data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
    # data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

    # Only the first roof entry whose label contains "Main" is reported
    extracted_roof_data = extract_roof_details_summary(text)
    main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0]
    data["Main Roof Type"] = main_roof_data["Roof Type"]
    data["Main Roof Insulation"] = main_roof_data["Roof Insulation"]
    data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"]

    walls_data = extract_wall_details_summary(text)
    # Get the main building wall data
    main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0]
    data["Main Wall Type"] = main_building_walls["Wall Type"]
    data["Main Wall Insulation"] = main_building_walls["Wall Insulation"]
    # data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"]
    # data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"]
    # data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"]
    # data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"]
    # data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"]
    # data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"]

    return data


# Output folder for the generated property tables
folder_location = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/July 2025 Heating Upgrades"

# Table of surveys; the loop below reads its "error" and "filepath" columns
df = pd.read_csv("/Users/khalimconn-kowlessar/Documents/hestia/July 2025 Surveys/export_summary_table.csv")

property_data = []
for _, x in tqdm(df.iterrows(), total=len(df)):

    # Skip rows that carry a value in the "error" column
    if not pd.isnull(x["error"]):
        continue

    # Skip rows that have no summary PDF to parse
    filepath = x["filepath"]
    if filepath == "No summary file found":
        continue

    summary_data = extract_summary_report(pdf_path=filepath)
    # Extracted fields override same-named columns of the source row
    property_data.append(
        {
            **x.to_dict(),
            **summary_data
        }
    )

property_data = pd.DataFrame(property_data)
# Store as excel
property_data.to_excel(os.path.join(folder_location, "property_table_24th_july.xlsx"))

# The target file has an .xlsx extension, so it is written with to_excel;
# to_csv would put CSV text into a file that spreadsheet tools expect to be a workbook.
sandwell_data = property_data[property_data["company"] == "sandwell.gov.uk"]
sandwell_data.to_excel(os.path.join(folder_location, "Sandwell EPR data (WIP).xlsx"))