mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
925 lines
42 KiB
Python
925 lines
42 KiB
Python
import PyPDF2
|
|
import re
|
|
from collections import Counter
|
|
from utils.logger import setup_logger
|
|
from xml.dom.minidom import parseString
|
|
|
|
logger = setup_logger()
|
|
|
|
"""
|
|
This script contains functions used to extract data from retrofit survey files, including EPRs,
|
|
summary reports, etc
|
|
"""
|
|
|
|
|
|
def is_elmhurst_energy_report(text):
    """
    Report whether extracted PDF text belongs to an Elmhurst Energy Report.

    The check is a case-sensitive prefix test: the text must begin with "ENERGY REPORT".

    :param text: String text extracted from the PDF
    :return: True when the text starts with the Energy Report heading, otherwise False
    """
    report_heading = "ENERGY REPORT"
    return text.startswith(report_heading)
|
|
|
|
|
|
def is_elmhurst_summary_report(text):
    """
    Report whether extracted PDF text belongs to an Elmhurst Summary Report.

    The check is a case-sensitive prefix test: the text must begin with "Summary Information".

    :param text: String text extracted from the PDF
    :return: True when the text starts with the Summary Report heading, otherwise False
    """
    report_heading = "Summary Information"
    return text.startswith(report_heading)
|
|
|
|
|
|
def is_osmosis_condition_report(text):
    """
    Report whether extracted PDF text belongs to an Osmosis Condition Report.

    Two heading variants are recognised (with and without "NEW"); the text must begin with one of
    them, compared case-sensitively.

    :param text: String text extracted from the PDF
    :return: True when the text starts with either Condition Report heading, otherwise False
    """
    report_headings = ("OsmosisACDNEWPAS2035ConditionReport", "OsmosisACDPAS2035ConditionReport")
    return text.startswith(report_headings)
|
|
|
|
|
|
def is_elmhurst_evidence_report(text):
    """
    Report whether extracted PDF text belongs to an Elmhurst Evidence Report.

    The check is a case-sensitive prefix test: the text must begin with "RdSAP Evidence Report".

    :param text: String text extracted from the PDF
    :return: True when the text starts with the Evidence Report heading, otherwise False
    """
    report_heading = "RdSAP Evidence Report"
    return text.startswith(report_heading)
|
|
|
|
|
|
def detect_pdf_report_type(pdf_path):
    """
    Detects the type of a PDF report from the text of its first page.

    :param pdf_path: String path to the PDF file
    :return: One of "elmhurst epr", "elmhurst summary report", "osmosis condition report" or
        "elmhurst evidence report"; None when no known report heading is recognised
    """
    # Only the first page is read; a PDF without pages is treated as having empty text
    with open(pdf_path, "rb") as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        if pdf_reader.pages:
            first_page_text = pdf_reader.pages[0].extract_text()
        else:
            first_page_text = ""

    # Checked in order; the first predicate that accepts the text decides the report type
    report_checks = (
        (is_elmhurst_energy_report, "elmhurst epr"),
        (is_elmhurst_summary_report, "elmhurst summary report"),
        (is_osmosis_condition_report, "osmosis condition report"),
        (is_elmhurst_evidence_report, "elmhurst evidence report"),
    )
    for is_report_type, report_type in report_checks:
        if is_report_type(first_page_text):
            return report_type

    return None
|
|
|
|
|
|
def detect_xml_report_type(xml_path):
    """
    Detects the type of XML report from the value of its first <Product> element.

    :param xml_path: String path to the XML file
    :return: String type of the report (currently only "full sap xml")
    :raises NotImplementedError: when the document has no <Product> element, the element is empty,
        or its value is not a recognised product (NotImplementedError is an Exception subclass, so
        callers catching Exception are unaffected)
    """
    # Read raw bytes so the XML parser honours the document's own encoding declaration / BOM
    # instead of relying on the platform's default text encoding
    with open(xml_path, "rb") as file:
        contents = file.read()

    document = parseString(contents)
    product_tag_search = document.getElementsByTagName("Product")
    if product_tag_search:
        # An empty element (<Product/>) has no child node to read a value from
        product_node = product_tag_search[0].firstChild
        if product_node is not None and product_node.nodeValue == "Sap 2012 Desktop":
            return "full sap xml"

    raise NotImplementedError("Not implemented")
|
|
|
|
|
|
def is_pdf(filename):
    """
    Determines if the provided filename is a PDF file.

    The extension is compared case-insensitively, so "report.PDF" is accepted as well as
    "report.pdf".

    :param filename: String name (or path) of the file
    :return: True when the name ends with a ".pdf" extension in any letter case
    """
    return filename.lower().endswith(".pdf")
|
|
|
|
|
|
def is_xml(filename):
    """
    Determines if the provided filename is an XML file.

    The extension is compared case-insensitively, so "report.XML" is accepted as well as
    "report.xml".

    :param filename: String name (or path) of the file
    :return: True when the name ends with a ".xml" extension in any letter case
    """
    return filename.lower().endswith(".xml")
|
|
|
|
|
|
class ElmhurstEprExtractor:
    """
    A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR).

    The static ``extract_*`` helpers work on report text that has already been pulled out of the
    PDF; ``extract`` reads the PDF at ``file_path`` and combines every helper into one dictionary.
    """

    # One "Construction details" block per building part: group 1 is the heading line, group 2 the
    # detail text up to the next block, the conservatory section, or the end of the text.
    _BUILDING_PART_DETAILS = re.compile(
        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
        re.DOTALL
    )

    def __init__(self, file_path):
        """
        :param file_path: String path to the EPR PDF file read by ``extract``
        """
        self.file_path = file_path

    @staticmethod
    def _search_required(pattern, text, log_message, error_message=None, flags=0):
        """
        Searches text for a pattern that must be present.

        :param pattern: Regular expression to search for
        :param text: String to search in
        :param log_message: Message logged as an error when the pattern is missing
        :param error_message: Message of the raised ValueError; defaults to log_message
        :param flags: Optional ``re`` flags
        :return: The match object
        :raises ValueError: when the pattern is not found
        """
        match = re.search(pattern, text, flags)
        if not match:
            logger.error(log_message)
            raise ValueError(log_message if error_message is None else error_message)
        return match

    @staticmethod
    def _first_group_or_none(pattern, text):
        """
        Returns the stripped first capture group of pattern in text, or None when there is no match.
        """
        match = re.search(pattern, text)
        return match.group(1).strip() if match else None

    @staticmethod
    def extract_window_age_description(windows_text):
        """
        Extracts the most common window age description and its proportion.

        :param windows_text: String text of the report's windows section
        :return: Dictionary with the most common and second most common descriptions, their
            proportions (%) and the total number of windows counted
        :raises ValueError: when none of the known descriptions occurs in the text
        """
        # Line breaks are dropped so a description is matched as one continuous string
        windows_text = windows_text.replace("\n", "")
        window_descriptions = [
            "Double post or during 2002",
            "Double pre 2002",
            "Double with unknown install date",
            "Secondary glazing",
            "Triple glazing",
            "Single glazing",
        ]
        description_counts = Counter(
            {description: windows_text.count(description) for description in window_descriptions}
        )

        number_of_windows = sum(description_counts.values())
        if not number_of_windows:
            raise ValueError("Failed to extract window data.")

        ranked_descriptions = description_counts.most_common(2)
        most_common_description, window_count = ranked_descriptions[0]
        window_proportion = window_count / number_of_windows * 100

        if window_proportion == 100:
            # Every window shares one description, so there is no runner-up to report
            second_most_common_description = None
            second_most_common_proportion = 0
        else:
            second_most_common_description, second_window_count = ranked_descriptions[1]
            second_most_common_proportion = second_window_count / number_of_windows * 100

        return {
            "Window Age Description": most_common_description,
            "Window Age Description Proportion (%)": window_proportion,
            "Secondary Window Age Description": second_most_common_description,
            "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
            "Number of Windows": number_of_windows
        }

    @staticmethod
    def extract_building_parts(text):
        """
        Extracts building parts and associated dimensions from the provided text.

        :param text: String text of the EPR
        :return: List of dictionaries, one per floor of each building part, plus a "Room in Roof"
            entry (floor area only) for parts whose heading carries a room-in-roof area
        """
        building_part_pattern = re.compile(
            r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party "
            r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)",
            re.DOTALL
        )
        # Compiled once rather than once per building part
        floor_pattern = re.compile(
            r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )

        data = []
        for match in building_part_pattern.finditer(text):
            part_name = match.group(1).strip()
            floor_data = match.group(2)

            room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name)
            if room_in_roof_match:
                cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
                data.append({
                    "Building Part": cleaned_part_name,
                    "Floor Level": "Room in Roof",
                    "Floor Area (m2)": float(room_in_roof_match.group(1)),
                    "Room Height (m)": None,
                    "Perimeter (m)": None,
                    "Party Wall Length (m)": None
                })
            else:
                cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip()

            for floor_match in floor_pattern.finditer(floor_data):
                data.append({
                    "Building Part": cleaned_part_name,
                    "Floor Level": floor_match.group(1),
                    "Floor Area (m2)": float(floor_match.group(2)),
                    "Room Height (m)": float(floor_match.group(3)),
                    "Perimeter (m)": float(floor_match.group(4)),
                    "Party Wall Length (m)": float(floor_match.group(5))
                })

        return data

    @classmethod
    def _iter_building_part_details(cls, text):
        """
        Yields (cleaned building part name, detail text) for each "Construction details" block.
        """
        for match in cls._BUILDING_PART_DETAILS.finditer(text):
            part_name = match.group(1).strip()
            cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
            yield cleaned_part_name, match.group(2)

    @staticmethod
    def extract_roof_details(text):
        """
        Extracts roof details for each building part in the provided text.

        :param text: String text of the EPR
        :return: List of dictionaries (one per building part); a field that is not present in the
            part's details is None
        """
        field = ElmhurstEprExtractor._first_group_or_none
        roof_data = []
        for cleaned_part_name, part_details in ElmhurstEprExtractor._iter_building_part_details(text):
            roof_data.append({
                "Building Part": cleaned_part_name,
                "Roof Type": field(r"Roof Type:\s*(.*?)(?=\n|$)", part_details),
                "Roof Insulation": field(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details),
                "Roof Insulation Thickness": field(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details),
            })

        return roof_data

    @staticmethod
    def extract_wall_details(text):
        """
        Extracts wall details for each building part in the provided text.

        :param text: String text of the EPR
        :return: List of dictionaries (one per building part); a field that is not present in the
            part's details is None. "Wall Thickness" is an int when present.
        """
        field = ElmhurstEprExtractor._first_group_or_none
        wall_data = []
        for cleaned_part_name, part_details in ElmhurstEprExtractor._iter_building_part_details(text):
            wall_thickness = field(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details)
            wall_data.append({
                "Building Part": cleaned_part_name,
                "Wall Type": field(r"Wall Type:\s*(.*?)(?=\n|$)", part_details),
                "Wall Insulation": field(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details),
                "Wall Dry-lining": field(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details),
                "Wall Thickness": int(wall_thickness) if wall_thickness is not None else None,
            })

        return wall_data

    @staticmethod
    def extract_conservatory(text):
        """
        Extracts conservatory data from the provided text.
        The section is located between "Conservatory" and "Doors".

        Args:
            text (str): The full text of the EPR PDF.

        Returns:
            dict: A dictionary with conservatory details:
                - "Conservatory Present"
                - "Conservatory Separated"
                - "Conservatory Floor Area"
                - "Conservatory Double Glazed"
                - "Conservatory Glazed Perimeter"
                - "Heated Conservatory Height"

        Raises:
            ValueError: If no text between "Conservatory" and "Doors" can be located.
        """
        conservatory_match = re.search(r"Conservatory\s*(.*?)\s*Doors", text, re.DOTALL)
        if not conservatory_match:
            logger.error("Failed to extract conservatory data.")
            raise ValueError("Could not extract conservatory data.")

        conservatory_text = conservatory_match.group(1)

        # A missing "Conservatory Present" line is treated the same as an explicit "No"
        present_match = re.search(r"Conservatory Present:\s*(Yes|No)", conservatory_text)
        if not present_match or present_match.group(1).strip() == "No":
            logger.info("Conservatory not present.")
            return {
                "Conservatory Present": "No",
                "Conservatory Separated": "",
                "Conservatory Floor Area": 0,
                "Conservatory Double Glazed": "",
                "Conservatory Glazed Perimeter": 0,
                "Heated Conservatory Height": "",
            }

        def text_field(pattern):
            # Stripped first group, or "" when the field is absent
            match = re.search(pattern, conservatory_text)
            return match.group(1).strip() if match else ""

        def number_field(pattern):
            # First group as a float, or 0 when the field is absent
            match = re.search(pattern, conservatory_text)
            return float(match.group(1)) if match else 0

        return {
            "Conservatory Present": "Yes",
            "Conservatory Separated": text_field(r"Conservatory Separated:\s*(Yes|No)"),
            "Conservatory Floor Area": number_field(r"Conservatory Floor Area:\s*([\d.]+)"),
            "Conservatory Double Glazed": text_field(r"Conservatory Double Glazed:\s*(Yes|No)"),
            "Conservatory Glazed Perimeter": number_field(r"Conservatory Glazed Perimeter:\s*([\d.]+)"),
            "Heated Conservatory Height": text_field(r"Heated Conservatory Height:\s*(.*?)(?=\n|$)"),
        }

    @staticmethod
    def _extract_heating_details(section_text, default_value=""):
        """
        Extracts heating details from a given section of text.

        Args:
            section_text (str): The section of text containing heating details.
            default_value (str, optional): The default value to return for missing fields. Defaults to "".

        Returns:
            dict: A dictionary containing heating system details ("System", "PCDF Reference",
                "Controls" and "% of Heat"; "% of Heat" is 0 when missing).
        """
        # Values end at a line break OR at the end of the section: the section text handed in has
        # its trailing whitespace trimmed, so the last field has no newline after it.
        system_search = re.search(r"Main Heating Code\s*(.*?)(?:\n|$)", section_text)
        pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text)
        controls_search = re.search(r"Main Heating Controls\s*(.*?)(?:\n|$)", section_text)
        heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text)

        return {
            "System": system_search.group(1).strip() if system_search else default_value,
            "PCDF Reference": pcdf_search.group(1) if pcdf_search else default_value,
            "Controls": controls_search.group(1).strip() if controls_search else default_value,
            "% of Heat": int(heat_search.group(1)) if heat_search else 0,
        }

    def extract_primary_heating(self, text):
        """
        Extracts the primary heating system ("Main Heating 1") details.

        :param text: String text of the EPR
        :return: Dictionary as produced by ``_extract_heating_details``
        :raises ValueError: when no "Main Heating 1" section can be located
        """
        # The section normally ends at "Main Heating 2"; when the report has no second main
        # heating system it ends at "Secondary Heating" instead
        primary_heating_section = re.search(
            r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL
        ) or re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL)
        if primary_heating_section is None:
            raise ValueError("Failed to extract primary heating data.")

        return self._extract_heating_details(primary_heating_section.group(1))

    def extract_secondary_heating_details(self, text):
        """
        Extracts the second main heating system ("Main Heating 2") details.

        :param text: String text of the EPR
        :return: Dictionary as produced by ``_extract_heating_details`` plus "Heating Code" (the
            "Secondary Heating Code" value, only filled in when a system was found). All fields
            are empty ("" / 0) when the report has no "Main Heating 2" section.
        """
        secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)

        if secondary_heating_section is None:
            output = {"System": "", "PCDF Reference": "", "Controls": "", "% of Heat": 0}
        else:
            output = dict(self._extract_heating_details(secondary_heating_section.group(1)))

        heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
        output["Heating Code"] = (
            heating_code_match.group(1).strip() if output["System"] and heating_code_match else ""
        )

        return output

    def extract(self):
        """
        Extracts all relevant data from the EPR PDF.

        Returns:
            dict: A dictionary containing extracted data, including:
                - Assessor name and assessment date
                - Address and Postcode
                - SAP Rating and Primary Energy Use
                - Lighting, Doors, Windows, Roof, and Wall Details
                - Heating systems (Primary and Secondary)
                - Building Parts and Conservatory

        Raises:
            ValueError: If any required field cannot be located in the report text.
        """
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text so the join cannot fail on a non-string
            text = "".join(page.extract_text() or "" for page in reader.pages)

        required = self._search_required
        data = {}

        data["Assessor Name"] = required(
            r"Created by:\s*(.*?)\n", text, "Failed to extract assessor name."
        ).group(1).strip()
        data["Assessment Date"] = required(
            r"\nAssessment Date\s*(.*?)\n", text, "Failed to extract assessment date."
        ).group(1).strip()

        address_match = required(
            r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, "Failed to extract address.",
            flags=re.DOTALL
        )
        data["Address"] = address_match.group(1).strip()
        # The postcode is taken to be the last comma-separated component of the address
        data["Postcode"] = data["Address"].split(",")[-1].strip()

        sap_match = required(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text, "Failed to extract SAP rating.")
        data["Current SAP Rating"] = int(sap_match.group(1))

        energy_match = required(
            r"Additional ratings for your home\s*([\d.]+)", text, "Failed to extract primary energy use."
        )
        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1))

        storeys_match = required(r"Number of Storeys:\s*(\d+)", text, "Failed to extract the number of storeys.")
        data["Number of Storeys"] = int(storeys_match.group(1))

        fuel_match = required(r"TOTAL\s*£(\d+)", text, "Failed to extract fuel bill.")
        data["Fuel Bill"] = f"£{fuel_match.group(1)}"

        total_doors_match = required(r"Total Doors:\s*(\d+)", text, "Failed to extract total doors.")
        data["Total Number of Doors"] = int(total_doors_match.group(1))

        insulated_doors_match = required(r"Insulated Doors:\s*(\d+)", text, "Failed to extract insulated doors.")
        data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))

        # Number of lighting outlets and number of fittings still needing low-energy lighting (LEL)
        lighting_fittings_match = required(
            r"Total number of light fittings\s*(\d+)", text, "Failed to extract lighting.",
            error_message="Failed to extract lighting"
        )
        data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
        lel_fittings_match = required(
            r"Total number of L.E.L. fittings\s*(\d+)", text, "Failed to extract LEL fittings."
        )
        data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
        data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

        windows_section = required(
            r"Windows\s*(.*?)\s*Draught Proofing", text, "Failed to extract window data.", flags=re.DOTALL
        )
        data["Windows"] = self.extract_window_age_description(windows_section.group(1))

        data["Primary Heating"] = self.extract_primary_heating(text)
        data["Secondary Heating"] = self.extract_secondary_heating_details(text)
        data["Building Parts"] = self.extract_building_parts(text)
        data["Roof Details"] = self.extract_roof_details(text)
        data["Wall Details"] = self.extract_wall_details(text)
        data["Conservatory"] = self.extract_conservatory(text)

        water_heating_code_match = required(
            r"Water Heating Code\s*(.*?)\n", text, "Failed to extract water heating code."
        )
        data["Water Heating Code"] = water_heating_code_match.group(1).strip()

        return data
|
|
|
|
|
|
class ElmhurstSummaryReportExtractor:
|
|
"""
|
|
A utility class for extracting specific data from Elmhurst Summary Reports.
|
|
"""
|
|
|
|
def __init__(self, file_path):
|
|
self.file_path = file_path
|
|
|
|
@staticmethod
|
|
def extract_window_age_description(windows_text):
|
|
"""
|
|
Extracts the most common window age description and its proportion.
|
|
|
|
Parameters:
|
|
windows_text (str): The text section containing window data.
|
|
|
|
Returns:
|
|
dict: A dictionary with the most common window age description and its proportion.
|
|
"""
|
|
# Clean up windows_text by removing line breaks for better pattern matching
|
|
windows_text = windows_text.replace("\n", "")
|
|
|
|
# Define possible window age descriptions
|
|
window_descriptions = [
|
|
"Double post or during 2002",
|
|
"Double pre 2002",
|
|
"Double with unknown install date",
|
|
"Secondary glazing",
|
|
"Triple glazing",
|
|
"Single glazing",
|
|
]
|
|
|
|
# Count occurrences of each description
|
|
description_counts = Counter()
|
|
for description in window_descriptions:
|
|
matches = re.findall(re.escape(description), windows_text)
|
|
description_counts[description] = len(matches)
|
|
|
|
if not description_counts or not sum(description_counts.values()):
|
|
raise ValueError("Failed to extract window data.")
|
|
|
|
# Determine the most common description and calculate its proportion
|
|
most_common_description, window_count = description_counts.most_common(1)[0]
|
|
window_proportion = window_count / sum(description_counts.values()) * 100
|
|
|
|
# Get the second most common and the proportion
|
|
if window_proportion == 100:
|
|
second_most_common_description = None
|
|
second_most_common_proportion = 0
|
|
else:
|
|
second_most_common_description, second_window_count = description_counts.most_common(2)[1]
|
|
second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100
|
|
|
|
return {
|
|
"Window Age Description": most_common_description,
|
|
"Window Age Description Proportion (%)": window_proportion,
|
|
"Secondary Window Age Description": second_most_common_description,
|
|
"Secondary Window Age Description Proportion (%)": second_most_common_proportion,
|
|
"Number of Windows": sum(description_counts.values())
|
|
}
|
|
|
|
@staticmethod
|
|
def extract_primary_heating(text):
|
|
primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
|
|
primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
|
|
primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
|
|
if primary_heating_section is None:
|
|
raise ValueError("Failed to extract primary heating data.")
|
|
|
|
primary_text = primary_heating_section.group(1)
|
|
|
|
output = {
|
|
'System': re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(1).strip(),
|
|
'PCDF Reference': re.search(r"PCDF boiler Reference\s*(\d+)", primary_text).group(1),
|
|
'Controls': re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(1).strip(),
|
|
'% of Heat': int(re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1))
|
|
}
|
|
return output
|
|
|
|
@staticmethod
|
|
def extract_secondary_heating_details(text):
|
|
secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
|
|
|
|
# Defaults
|
|
output = {
|
|
"System": "",
|
|
"PCDF Reference": "",
|
|
"Controls": "",
|
|
"% of Heat": 0,
|
|
"Heating Code": ""
|
|
}
|
|
if secondary_heating_section is not None:
|
|
# Overwrite defaults
|
|
secondary_text = secondary_heating_section.group(1)
|
|
|
|
main_heating_code_match_secondary = re.search(
|
|
r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
|
|
)
|
|
output["System"] = main_heating_code_match_secondary.group(1).strip()
|
|
output["PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1)
|
|
|
|
second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
|
|
output["Heating Controls"] = (
|
|
second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
|
|
)
|
|
output["% of Heat"] = int(
|
|
re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
|
|
)
|
|
|
|
secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
|
|
if output["System"] != "":
|
|
output["Heating Code"] = (
|
|
secondary_heating_code_match.group(1).strip() if secondary_heating_code_match else ""
|
|
)
|
|
|
|
return output
|
|
|
|
@staticmethod
|
|
def extract_building_parts(text):
|
|
"""
|
|
Extracts building parts and associated dimensions from the summary report PDF.
|
|
This includes Main Property, multiple extensions if they exist, and Room in Roof areas.
|
|
"""
|
|
data = []
|
|
|
|
# Locate the Dimensions section
|
|
dimensions_section = re.search(
|
|
r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
|
|
)
|
|
if not dimensions_section:
|
|
raise ValueError("Failed to locate dimensions section in the text.")
|
|
|
|
dimensions_text = dimensions_section.group(1)
|
|
|
|
# Pattern to extract each building part, starting from Main Property and including extensions
|
|
building_part_pattern = re.compile(
|
|
r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
|
|
r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)",
|
|
re.DOTALL
|
|
)
|
|
|
|
# Loop through each building part match, including Main Property and extensions
|
|
for match in building_part_pattern.finditer(dimensions_text):
|
|
part_name = match.group(1)
|
|
floor_data = match.group(2)
|
|
|
|
# Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length
|
|
floor_pattern = re.compile(
|
|
r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
|
|
)
|
|
|
|
# Extract data for each floor within the building part
|
|
for floor_match in floor_pattern.finditer(floor_data):
|
|
floor_level = floor_match.group(1)
|
|
floor_area = float(floor_match.group(2))
|
|
room_height = float(floor_match.group(3))
|
|
perimeter = float(floor_match.group(4))
|
|
party_wall_length = float(floor_match.group(5))
|
|
|
|
# Append to data list
|
|
data.append(
|
|
{
|
|
"Building Part": part_name,
|
|
"Floor Level": floor_level,
|
|
"Floor Area (m2)": floor_area,
|
|
"Room Height (m)": room_height,
|
|
"Perimeter (m)": perimeter,
|
|
"Party Wall Length (m)": party_wall_length
|
|
}
|
|
)
|
|
|
|
# Check specifically for "Room(s) in Roof" entries, which only have Floor Area
|
|
room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)")
|
|
room_in_roof_match = room_in_roof_pattern.search(floor_data)
|
|
if room_in_roof_match:
|
|
floor_area = float(room_in_roof_match.group(1))
|
|
data.append(
|
|
{
|
|
"Building Part": part_name,
|
|
"Floor Level": "Room in Roof",
|
|
"Floor Area (m2)": floor_area,
|
|
"Room Height (m)": None, # Placeholder for missing data
|
|
"Perimeter (m)": None, # Placeholder for missing data
|
|
"Party Wall Length (m)": None # Placeholder for missing data
|
|
}
|
|
)
|
|
|
|
# Calculate aggregated dimensions
|
|
main_property = [part for part in data if "Main Property" in part["Building Part"]]
|
|
first_extensions = [part for part in data if "1st Extension" in part["Building Part"]]
|
|
dimensions = {
|
|
"Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]),
|
|
"Total Ground Floor Area (m2)": sum(
|
|
[part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]]
|
|
),
|
|
"RIR Floor Area": sum(
|
|
[part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]]
|
|
),
|
|
"Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if
|
|
x["Perimeter (m)"] and x["Room Height (m)"]]),
|
|
"First Extension Wall Area (m2)": sum(
|
|
[x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if
|
|
x["Perimeter (m)"] and x["Room Height (m)"]]
|
|
),
|
|
}
|
|
|
|
return dimensions
|
|
|
|
@staticmethod
|
|
def extract_roof_details(text):
|
|
"""
|
|
Extracts roof type, insulation, and insulation thickness for each building part
|
|
in the 8.0 Roofs section of the summary report.
|
|
"""
|
|
# Define data structure to hold results
|
|
roof_data = []
|
|
|
|
# Locate the entire 8.0 Roofs section
|
|
roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL)
|
|
if not roof_section_match:
|
|
return roof_data # Return empty if no roof section is found
|
|
|
|
# Extract the roof section and append "9.0 Floors:" as the boundary
|
|
roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:"
|
|
|
|
# Define pattern to match each building part's roof entry
|
|
building_part_pattern = re.compile(
|
|
r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label
|
|
r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label,
|
|
# or end
|
|
r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation
|
|
r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness
|
|
re.DOTALL
|
|
)
|
|
|
|
# Extract each building part's data
|
|
for match in building_part_pattern.finditer(roof_section):
|
|
part_name = match.group(1).strip() # Building part label
|
|
roof_type = match.group(2).strip() # Roof Type
|
|
roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation
|
|
roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness
|
|
|
|
# Cleaning to handle annoying cases when it comes out like this:
|
|
# 'A Another dwelling above\n1st Extension'
|
|
if roof_type.startswith("A Another dwelling above"):
|
|
roof_type = "A Another dwelling above"
|
|
|
|
# Store results for this building part
|
|
roof_data.append(
|
|
{
|
|
"Building Part": part_name,
|
|
"Roof Type": roof_type,
|
|
"Roof Insulation": roof_insulation,
|
|
"Roof Insulation Thickness": roof_insulation_thickness,
|
|
}
|
|
)
|
|
|
|
return roof_data
|
|
|
|
@staticmethod
|
|
def extract_wall_details(text):
|
|
"""
|
|
Extracts wall type, insulation, dry-lining, and thickness for each building part,
|
|
including any alternative wall details within the 7.0 Walls section of the summary PDF text.
|
|
"""
|
|
# Define data structure to hold all building part wall entries
|
|
wall_data = []
|
|
|
|
# Locate the entire 7.0 Walls section
|
|
wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1)
|
|
|
|
# Define pattern to match each building part's wall entry within the section
|
|
building_part_pattern = re.compile(
|
|
r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label
|
|
r"Type\s+(.*?)\n" # Matches main wall Type
|
|
r"Insulation\s+(.*?)\n" # Matches main wall Insulation
|
|
r"(Dry-lining\s+(.*?)\n)?" # Optional main wall Dry-lining
|
|
r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown
|
|
r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness
|
|
re.DOTALL
|
|
)
|
|
|
|
# Define pattern to capture alternative wall details, if present
|
|
alternative_wall_pattern = re.compile(
|
|
r"Alternative Wall Area.*?\n" # Matches start of alternative wall section
|
|
r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type
|
|
r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation
|
|
r"(Alternative Dry-lining\s+(.*?)\n)?" # Optional Alternative Dry-lining
|
|
r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown
|
|
r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness
|
|
re.DOTALL
|
|
)
|
|
|
|
# Find all building part entries within the 7.0 Walls section
|
|
for match in building_part_pattern.finditer(wall_section):
|
|
wall_label = match.group(1).strip()
|
|
main_wall_type = match.group(2).strip()
|
|
main_wall_insulation = match.group(3).strip()
|
|
main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A"
|
|
main_wall_thickness_unknown = match.group(6).strip()
|
|
main_wall_thickness = int(match.group(7))
|
|
|
|
# Initialize dictionary for this wall entry
|
|
wall_entry = {
|
|
"Building Part": wall_label,
|
|
"Wall Type": main_wall_type,
|
|
"Wall Insulation": main_wall_insulation,
|
|
"Wall Dry-lining": main_wall_dry_lining,
|
|
"Wall Thickness Unknown": main_wall_thickness_unknown,
|
|
"Wall Thickness (mm)": main_wall_thickness,
|
|
"Alternative Wall Type": None,
|
|
"Alternative Wall Insulation": None,
|
|
"Alternative Wall Dry-lining": "N/A",
|
|
"Alternative Wall Thickness Unknown": None,
|
|
"Alternative Wall Thickness (mm)": None,
|
|
}
|
|
|
|
# Check if there's an alternative wall section following this wall entry
|
|
alt_match = alternative_wall_pattern.search(wall_section, match.end())
|
|
if alt_match:
|
|
wall_entry["Alternative Wall Type"] = alt_match.group(1).strip()
|
|
wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip()
|
|
wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A"
|
|
wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip()
|
|
wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6))
|
|
|
|
# Append each building part as a dictionary in the wall_data list
|
|
wall_data.append(wall_entry)
|
|
|
|
return wall_data
|
|
|
|
def extract(self):
    """
    Extracts survey data from the PDF file at ``self.file_path``.

    The text of every page is concatenated and searched with regular expressions. The
    windows, heating, building part, roof and wall sections are delegated to the other
    ``extract_*`` methods of this class.

    Keys of the returned dictionary:
    - 'Assessor Name', 'Assessment Date'
    - 'Address', 'Postcode'
    - 'Current SAP Rating' (the digits of the rating, as a string)
    - 'Primary Energy Use Intensity (kWh/m2/yr)' (always None for this report type)
    - 'Number of Storeys', 'Fuel Bill'
    - 'Total Number of Doors', 'Number of Insulated Doors'
    - 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL'
    - 'Windows', 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details',
      'Wall Details', 'Water Heating Code'
    - 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining', 'Main Wall Thickness'
    - 'Main Building Alternative Wall Type', 'Main Building Alternative Wall Insulation',
      'Main Building Alternative Wall Dry-lining', 'Main Building Alternative Wall Thickness'

    :return: Dictionary holding the extracted values
    :raises ValueError: If a required value cannot be found in the PDF text
    """
    with open(self.file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once rather than growing a string page by page; a page that yields no text
        # contributes an empty string instead of breaking the concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)

    def required_group(pattern, error_message, flags=0):
        """Return group 1 of the first match of ``pattern`` in the text or raise ValueError."""
        found = re.search(pattern, text, flags)
        if not found:
            raise ValueError(error_message)
        return found.group(1)

    data = {}

    # Assessor: the report lists the name before the title, the output puts the title first
    name_match = re.search(r"Name:\s*([A-Za-z\s]+)\s*Title:\s*([A-Za-z\.]+)", text)
    if not name_match:
        raise ValueError("Couldn't extract surveyor name")
    data["Assessor Name"] = name_match.group(2).strip() + " " + name_match.group(1).strip()
    data["Assessment Date"] = required_group(
        r"Inspection Date:\s*(.*?)\n", "Could not extract inspection date"
    ).strip()

    # Address and postcode.
    # Each entry is (field label, label of the line that follows it in the report), listed in
    # the order the parts appear in the joined address.
    address_fields = (
        ("House No", "Street"),
        ("House Name", "House No"),
        ("Street", "Locality"),
        ("Locality", "Town"),
        ("Town", "County"),
        ("County", "Property Tenure"),
        ("Region", "House Name"),
        ("Postcode", "Region"),
    )
    address_values = {}
    for label, next_label in address_fields:
        field_match = re.search(rf"{label}:\s*(.*?)\n{next_label}:", text)
        address_values[label] = field_match.group(1).strip() if field_match else ""

    # Join non-empty parts with a comma
    data["Address"] = ", ".join(
        [address_values[label] for label, _ in address_fields if address_values[label]]
    )

    # The postcode is mandatory, unlike the other address parts. An empty captured value is
    # still accepted; only a missing "Postcode:" field is an error.
    if re.search(r"Postcode:\s*(.*?)\nRegion:", text) is None:
        raise ValueError("Could not extract postcode")
    data["Postcode"] = address_values["Postcode"]

    # Current SAP rating is captured as "<band letter> <number>"; only the number is kept
    sap_rating = required_group(r"Current SAP rating:\s*([A-Z] \d+)", "Could not extract SAP rating")
    data["Current SAP Rating"] = sap_rating.split(" ")[1]

    # We don't have primary energy in the summary report
    data['Primary Energy Use Intensity (kWh/m2/yr)'] = None

    data["Number of Storeys"] = int(
        required_group(r"Number of Storeys:\s*(\d+)", "Could not extract number of storeys")
    )

    fuel_bill = required_group(r"Fuel Bill:\s*£(\d+)", "Could not extract fuel bill")
    data["Fuel Bill"] = f"£{fuel_bill}"

    data["Total Number of Doors"] = int(
        required_group(r"Total Number of Doors\s*(\d+)", "Could not extract total number of doors")
    )
    data["Number of Insulated Doors"] = int(
        required_group(r"Number of Insulated Doors\s*(\d+)", "Could not extract number of insulated doors")
    )

    # Lighting
    data["Number of Light Fittings"] = int(
        required_group(r"Total number of light fittings\s*(\d+)", "Could not extract number of light fittings")
    )
    data["Number of LEL Fittings"] = int(
        required_group(r"Total number of L.E.L. fittings\s*(\d+)", "Could not extract number of LEL fittings")
    )
    data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

    # Windows: everything between the "Windows" heading and "Draught Proofing"
    windows_text = required_group(
        r"Windows\s*(.*?)\s*Draught Proofing", "Failed to extract window data.", re.DOTALL
    )
    data["Windows"] = self.extract_window_age_description(windows_text)

    data["Primary Heating"] = self.extract_primary_heating(text)
    data["Secondary Heating"] = self.extract_secondary_heating_details(text)
    data["Building Parts"] = self.extract_building_parts(text)
    data["Roof Details"] = self.extract_roof_details(text)
    wall_details = self.extract_wall_details(text)
    data["Wall Details"] = wall_details

    data["Water Heating Code"] = required_group(
        r"Water Heating Code\s*(.*?)\n", "Failed to extract water heating code."
    ).strip()

    # Get the main building wall data: the first wall entry whose building part mentions
    # "Main". The entries come from the wall details extracted above.
    main_building_walls = next(
        (wall for wall in wall_details if "Main" in wall["Building Part"]), None
    )
    if main_building_walls is None:
        raise ValueError("Could not extract main building wall details")

    # (output key, key in the wall entry)
    main_wall_keys = (
        ("Main Wall Type", "Wall Type"),
        ("Main Wall Insulation", "Wall Insulation"),
        ("Main Wall Dry-lining", "Wall Dry-lining"),
        ("Main Wall Thickness", "Wall Thickness (mm)"),
        ("Main Building Alternative Wall Type", "Alternative Wall Type"),
        ("Main Building Alternative Wall Insulation", "Alternative Wall Insulation"),
        ("Main Building Alternative Wall Dry-lining", "Alternative Wall Dry-lining"),
        ("Main Building Alternative Wall Thickness", "Alternative Wall Thickness (mm)"),
    )
    for output_key, wall_key in main_wall_keys:
        data[output_key] = main_building_walls[wall_key]

    return data
|