""" This is a placeholder script to extract epr data from files, where we can """ """ July 2025 LiveWest Heating Upgrades """ import os import re import PyPDF2 import pandas as pd from tqdm import tqdm from collections import Counter def extract_window_age_description(windows_text): """ Extracts the most common window age description and its proportion. Parameters: windows_text (str): The text section containing window data. Returns: dict: A dictionary with the most common window age description and its proportion. """ # Clean up windows_text by removing line breaks for better pattern matching windows_text = windows_text.replace("\n", "") # Define possible window age descriptions window_descriptions = [ "Double post or during 2002", "Double pre 2002", "Double with unknown install date", "Secondary glazing", "Triple glazing", "Single glazing", "Double between 2002 \nand 2021", "Double between 2002 and 2021" ] # Count occurrences of each description description_counts = Counter() for description in window_descriptions: matches = re.findall(re.escape(description), windows_text) description_counts[description] = len(matches) if not description_counts or not sum(description_counts.values()): raise ValueError("Failed to extract window data.") # Determine the most common description and calculate its proportion most_common_description, window_count = description_counts.most_common(1)[0] window_proportion = window_count / sum(description_counts.values()) * 100 # Get the second most common and the proportion if window_proportion == 100: second_most_common_description = None second_most_common_proportion = 0 else: second_most_common_description, second_window_count = description_counts.most_common(2)[1] second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 return { "Window Age Description": most_common_description, "Window Age Description Proportion (%)": window_proportion, "Secondary Window Age Description": second_most_common_description, "Secondary Window Age Description Proportion (%)": second_most_common_proportion, "Number of Windows": sum(description_counts.values()) } def extract_building_parts_summary(text): """ Extracts building parts and associated dimensions from the summary report PDF. This includes Main Property, multiple extensions if they exist, and Room in Roof areas. """ data = [] # Locate the Dimensions section dimensions_section = re.search( r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL ) if not dimensions_section: dimensions_section = re.search( r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL ) if not dimensions_section: raise ValueError("Failed to locate dimensions section in the text.") dimensions_text = dimensions_section.group(1) # Pattern to extract each building part, starting from Main Property and including extensions building_part_pattern = re.compile( r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", re.DOTALL ) # Loop through each building part match, including Main Property and extensions for match in building_part_pattern.finditer(dimensions_text): part_name = match.group(1) floor_data = match.group(2) # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length floor_pattern = re.compile( r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" ) # Extract data for each floor within the building part for floor_match in floor_pattern.finditer(floor_data): floor_level = floor_match.group(1) floor_area = float(floor_match.group(2)) room_height = float(floor_match.group(3)) perimeter = float(floor_match.group(4)) party_wall_length = float(floor_match.group(5)) # Append to data list data.append({ "Building Part": part_name, "Floor Level": floor_level, "Floor Area (m2)": floor_area, "Room Height (m)": room_height, "Perimeter (m)": perimeter, "Party Wall Length (m)": party_wall_length }) # Check specifically for "Room(s) in Roof" entries, which only have Floor Area room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") room_in_roof_match = room_in_roof_pattern.search(floor_data) if room_in_roof_match: floor_area = float(room_in_roof_match.group(1)) data.append({ "Building Part": part_name, "Floor Level": "Room in Roof", "Floor Area (m2)": floor_area, "Room Height (m)": None, # Placeholder for missing data "Perimeter (m)": None, # Placeholder for missing data "Party Wall Length (m)": None # Placeholder for missing data }) # Calculate aggregated dimensions main_property = [part for part in data if "Main Property" in part["Building Part"]] first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] dimensions = { "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), "Total Ground Floor Area (m2)": sum( [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] ), "RIR Floor Area": sum( [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] ), "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if x["Perimeter (m)"] and x["Room Height (m)"]]), "First Extension Wall Area (m2)": sum( [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if x["Perimeter (m)"] and x["Room Height (m)"]] ), } return dimensions def extract_roof_details_summary(text): """ Extracts roof type, insulation, and insulation thickness for each building part in the 8.0 Roofs section of the summary report. """ # Define data structure to hold results roof_data = [] # Locate the entire 8.0 Roofs section roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) if not roof_section_match: return roof_data # Return empty if no roof section is found # Extract the roof section and append "9.0 Floors:" as the boundary roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" # Define pattern to match each building part's roof entry building_part_pattern = re.compile( r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, or end r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness re.DOTALL ) # Extract each building part's data for match in building_part_pattern.finditer(roof_section): part_name = match.group(1).strip() # Building part label roof_type = match.group(2).strip() # Roof Type roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness # Cleaning to handle annoying cases when it comes out like this: # 'A Another dwelling above\n1st Extension' if roof_type.startswith("A Another dwelling above"): roof_type = "A Another dwelling above" # Store results for this building part roof_data.append({ "Building Part": part_name, "Roof Type": roof_type, "Roof Insulation": roof_insulation, "Roof Insulation Thickness": roof_insulation_thickness, }) return roof_data def extract_wall_details_summary(text): """ Extracts wall type, insulation, dry-lining, and thickness for each building part, including any alternative wall details within the 7.0 Walls section of the summary PDF text. """ # Define data structure to hold all building part wall entries wall_data = [] # Locate the entire 7.0 Walls section wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1) # Define pattern to match each building part's wall entry within the section building_part_pattern = re.compile( r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label r"Type\s+(.*?)\n" # Matches main wall Type r"Insulation\s+(.*?)\n", # Matches main wall Insulation # r"(Dry-lining\s+(.*?)\n)?" # Optional main wall Dry-lining # r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown # r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness re.DOTALL ) # Define pattern to capture alternative wall details, if present alternative_wall_pattern = re.compile( r"Alternative Wall Area.*?\n" # Matches start of alternative wall section r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation r"(Alternative Dry-lining\s+(.*?)\n)?" # Optional Alternative Dry-lining r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness re.DOTALL ) # Find all building part entries within the 7.0 Walls section for match in building_part_pattern.finditer(wall_section): wall_label = match.group(1).strip() main_wall_type = match.group(2).strip() main_wall_insulation = match.group(3).strip() # main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A" # main_wall_thickness_unknown = match.group(6).strip() # main_wall_thickness = int(match.group(7)) # Initialize dictionary for this wall entry wall_entry = { "Building Part": wall_label, "Wall Type": main_wall_type, "Wall Insulation": main_wall_insulation, # "Wall Dry-lining": main_wall_dry_lining, # "Wall Thickness Unknown": main_wall_thickness_unknown, # "Wall Thickness (mm)": main_wall_thickness, "Alternative Wall Type": None, "Alternative Wall Insulation": None, "Alternative Wall Dry-lining": "N/A", "Alternative Wall Thickness Unknown": None, "Alternative Wall Thickness (mm)": None, } # Check if there's an alternative wall section following this wall entry alt_match = alternative_wall_pattern.search(wall_section, match.end()) if alt_match: wall_entry["Alternative Wall Type"] = alt_match.group(1).strip() wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip() wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A" wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip() wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6)) # Append each building part as a dictionary in the wall_data list wall_data.append(wall_entry) return wall_data def extract_summary_report(pdf_path): """ Extracts specific data from the provided PDF file. Data includes: - Current SAP rating - Fuel Bill - Address """ data = { "Address": None, "Postcode": None, "Current SAP Rating": None, "Current EPC Band": None, "Fuel Bill": None, "Main Building Age Band": None, "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, "Secondary Window Age Description": None, "Secondary Window Age Description Proportion (%)": None, "Number of Windows": None, "Total Number of Doors": None, "Number of Insulated Doors": None, "Existing Primary Heating System": None, "Existing Primary Heating PCDF Reference": None, "Existing Primary Heating Controls": None, "Existing Primary Heating % of Heat": None, "Existing Secondary Heating System": None, "Existing Secondary Heating PCDF Reference": None, "Existing Secondary Heating Controls": None, "Existing Secondary Heating % of Heat": None, "Secondary Heating Code": None, "Water Heating Code": None, 'Total Floor Area (m2)': None, 'Total Ground Floor Area (m2)': None, 'RIR Floor Area': None, 'Main Building Wall Area (m2)': None, 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, "Number of fittings needing LEL": None, "Main Roof Type": None, "Main Roof Insulation": None, "Main Roof Insulation Thickness": None, "Main Wall Type": None, "Main Wall Insulation": None, "Main Wall Dry-lining": None, "Main Wall Thickness": None, "Main Building Alternative Wall Type": None, "Main Building Alternative Wall Insulation": None, "Main Building Alternative Wall Dry-lining": None, "Main Building Alternative Wall Thickness": None, } with (open(pdf_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" for page in reader.pages: text += page.extract_text() # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] data["Property Type"] = ( re.search(r"Property type:\s*(.*?)\n2\.0", text, re.DOTALL) .group(1).replace('\n', ' ').strip().replace(" ", " ") ) # Extract age age_band_match = re.search( r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", text ) data["Main Building Age Band"] = age_band_match.group(1) # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) # Grab number of heated rooms, number of habitable rooms data["Number of Heated Rooms"] = int(re.search(r"Heated Habitable Rooms:\s*(\d+)", text).group(1)) data["Number of Habitable rooms"] = int(re.search(r"Habitable Rooms:\s*(\d+)", text).group(1)) # Extract Carbon Emissions # carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text) # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1)) # Extract Fuel Bill fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" # Extract individual address components postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) street = re.search(r"Street:\s*(.*?)\nLocality:", text) locality = re.search(r"Locality:\s*(.*?)\nTown:", text) town = re.search(r"Town:\s*(.*?)\nCounty:", text) county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) # Clean extracted values and remove any prefixes address_parts = [ house_no.group(1).strip() if house_no else "", house_name.group(1).strip() if house_name else "", street.group(1).strip() if street else "", locality.group(1).strip() if locality else "", town.group(1).strip() if town else "", county.group(1).strip() if county else "", postcode.group(1).strip() if postcode else "" ] # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) data["Postcode"] = postcode.group(1).strip() # windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) # windows_text = windows_section.group(1) # window_data = extract_window_age_description(windows_text) # data.update(window_data) # Extract Total Number of Doors total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) data["Total Number of Doors"] = int(total_doors_match.group(1)) # Extract Number of Insulated Doors insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) # Extract heating system # Extract Primary Heating Data # Extract Primary Heating Section primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 primary_text = primary_heating_section.group(1) # Handle extracting main heating code: mainheat_search = re.search(r"Main Heating Code\s*(.*?)\n", primary_text) if mainheat_search is None: mainheat_search = re.search(r"Main Heating EES Code\s*(.*?)\n", primary_text) if mainheat_search is None: mainheat_search = re.search(r"PCDF boiler Reference\s*(.*?)\n", primary_text) data["Existing Primary Heating System"] = mainheat_search.group(1).strip() data["Existing Primary Heating PCDF Reference"] = re.search( r"PCDF boiler Reference\s*(\d+)", primary_text ).group(1) controls_search = re.search( r"Main Heating Controls Sap\s*(.*?)\n", primary_text ) if controls_search is None: controls_search = re.search( r"Main Heating Controls\s*(.*?)\n", primary_text ) data["Existing Primary Heating Controls"] = controls_search.group(1).strip() data["Existing Primary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) ) # Extract Secondary Heating Section secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) if secondary_heating_section is None: data["Existing Secondary Heating System"] = "" data["Existing Secondary Heating PCDF Reference"] = "" data["Existing Secondary Heating Controls"] = "" data["Existing Secondary Heating % of Heat"] = 0 else: secondary_text = secondary_heating_section.group(1) main_heating_code_match_secondary = re.search( r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text ) if main_heating_code_match_secondary is None: main_heating_code_match_secondary = re.search( r"Main Heating EES Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text ) data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) data["Existing Secondary Heating Controls"] = ( second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" ) data["Existing Secondary Heating % of Heat"] = int( re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) ) # Extract Secondary Heating and Water Heating Codes secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) if data["Existing Secondary Heating System"] == "": data["Secondary Heating Code"] = "" else: data["Secondary Heating Code"] = secondary_heating_code_match.group( 1).strip() if secondary_heating_code_match else "" data["Water Heating Code"] = water_heating_code_match.group(1).strip() dimensions = extract_building_parts_summary(text) data.update(dimensions) # Need to get the hot water section_match = re.search(r"15\.0.*?\n(.*?)15\.1", text, re.DOTALL) section_text = section_match.group(1) # Extract Water Heating Code code_match = re.search(r"Water Heating Code\s+(\S+)", section_text) fuel_match = re.search(r"Water Heating Fuel Type\s+(.+)", section_text) if fuel_match is None: fuel_type = None else: fuel_type = fuel_match.group(1).strip() code = code_match.group(1) data["Hot Water System"] = code data["Hot Water Fuel"] = fuel_type # data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) # data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) # data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] extracted_roof_data = extract_roof_details_summary(text) main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] data["Main Roof Type"] = main_roof_data["Roof Type"] data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] walls_data = extract_wall_details_summary(text) # Get the main building wall data main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] data["Main Wall Type"] = main_building_walls["Wall Type"] data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] # data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] # data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] # data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] # data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] # data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] # data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] return data folder_location = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/July 2025 Heating Upgrades" df = pd.read_csv("/Users/khalimconn-kowlessar/Documents/hestia/July 2025 Surveys/export_summary_table.csv") property_data = [] for _, x in tqdm(df.iterrows(), total=len(df)): if not pd.isnull(x["error"]): continue filepath = x["filepath"] if filepath in ["No summary file found"]: continue summary_data = extract_summary_report(pdf_path=filepath) property_data.append( { **x.to_dict(), **summary_data } ) property_data = pd.DataFrame(property_data) # Store as excel property_data.to_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/July 2025 Heating " "Upgrades/property_table_24th_july.xlsx" ) sandwell_data = property_data[property_data["company"] == "sandwell.gov.uk"] sandwell_data.to_csv( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/July 2025 Heating " "Upgrades/Sandwell EPR data (WIP).xlsx" )