mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
256 lines
9.7 KiB
Python
256 lines
9.7 KiB
Python
import re
|
|
|
|
|
|
class SiteNotesExtractor:
    """
    Extracts SAP rating, carbon emissions, energy costs, building dimensions and wall
    details from the text of an EPC summary report.

    Every ``extract_*`` method searches ``self.text`` with regular expressions and raises
    ``ValueError`` when the expected section or value cannot be found. All methods except
    ``extract_walls`` store their results in ``self.data``; ``extract_walls`` returns its
    result instead.
    """

    def __init__(self, pdf_text):
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.

        :param pdf_text: Text extracted from the EPC summary PDF.
        """
        self.text = pdf_text
        self.data = {}

    def extract_sap_rating(self):
        """
        Extracts the current and potential EPC band and SAP rating from the report.

        Stores "Current EPC Band", "Current SAP Rating", "Potential EPC Band" and
        "Potential SAP Rating" in ``self.data``.

        :raises ValueError: If the SAP rating line is not present in the text.
        """
        match = re.search(
            r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)",
            self.text,
        )

        if not match:
            raise ValueError("No SAP rating found in the report")

        self.data.update({
            "Current EPC Band": match.group(1),
            "Current SAP Rating": int(match.group(2)),
            "Potential EPC Band": match.group(3),
            "Potential SAP Rating": int(match.group(4)),
        })

    def extract_carbon_emissions(self):
        """
        Extracts the current annual carbon emissions (TCO2).

        Stores "Current Carbon Emissions (TCO2)" in ``self.data``.

        :raises ValueError: If the emissions line is not present in the text.
        """
        match = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)

        if not match:
            raise ValueError("No carbon emissions found in the report")

        self.data.update({
            "Current Carbon Emissions (TCO2)": float(match.group(1)),
        })

    def extract_building_dimensions(self):
        """
        Extracts dimensions for each building part and stores them in a list.
        Handles Main Property and multiple extensions.

        Stores the per-part list under "Building Dimensions" and the summed floor area
        and heat loss area under "Total Building Dimensions" in ``self.data``.

        :raises ValueError: If the dimensions table cannot be located, or if it contains
            no recognisable building part rows.
        """
        # Locate the Dimensions section: everything between the table header and the
        # "5.0 Conservatory" heading that follows it.
        dimensions_section = re.search(
            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
            r"Party Wall "
            r"Length \(m\)\n"
            r"(.*?)\n5\.0 Conservatory",
            self.text,
            re.DOTALL,
        )

        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")

        dimensions_text = dimensions_section.group(1)

        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
        # followed by its four numeric columns.
        building_part_pattern = re.compile(
            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )

        building_parts = []
        for match in building_part_pattern.finditer(dimensions_text):
            part = {
                "Building Part": match.group(1).strip(),
                "Part Floor Area (m2)": float(match.group(2)),
                "Room Height (m)": float(match.group(3)),
                "Loss Perimeter (m)": float(match.group(4)),
                "Party Wall Length (m)": float(match.group(5)),
            }
            # Heat loss area is computed as loss perimeter multiplied by room height.
            part["Heat Loss Area (m2)"] = part["Loss Perimeter (m)"] * part["Room Height (m)"]
            building_parts.append(part)

        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        # Totals across all building parts.
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self):
        """
        Extracts the estimated annual energy costs (£) from the report.

        Stores "Estimated Annual Energy Cost (£)" in ``self.data`` as a float, with
        thousands separators removed.

        :raises ValueError: If the energy cost line is not present in the text.
        """
        match = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)

        if not match:
            raise ValueError("No bills estimate found in the report")

        self.data["Estimated Annual Energy Cost (£)"] = float(match.group(1).replace(",", ""))

    def extract_all(self):
        """
        Runs the SAP rating, carbon emissions, bills and dimensions extraction methods
        and returns a dictionary with the extracted data.

        ``extract_walls`` is not called here; call it separately if wall details are needed.

        :return: ``self.data`` after all extractions have run.
        :raises ValueError: If any of the extraction methods fails to find its data.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()

        # Specific measures not yet extracted here:
        # primary wall, secondary wall, roof, floor, heating system, hot water system,
        # windows, doors, lighting, ventilation, solar.

        return self.data

    def extract_walls(self):
        """
        Extracts wall type, insulation, and thickness details for each building part
        listed in the 7.0 Walls section of the summary PDF text.

        Fields that are absent for a building part are reported as "Unknown" (or None
        for the numeric wall thickness). Alternative wall details are not parsed yet;
        the "Alternative ..." keys are always None.

        :return: A list with one dictionary per building part found in the section.
        :raises ValueError: If the walls section cannot be located.
        """
        # Isolate the 7.0 Walls section
        section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", self.text, re.DOTALL)
        if not section_match:
            raise ValueError("Failed to locate the walls section in the text.")

        # The section regex consumes the newline in front of "8.0 Roofs", so the final
        # line of the captured text has no terminator. The optional field groups below
        # each require a trailing newline; restore it so a field that happens to sit on
        # the last line of the section is not silently dropped.
        wall_section = section_match.group(1) + "\n"

        # One match per building part header, followed by its optional field lines.
        # The negative lookahead on "Insulation" stops that group from swallowing an
        # "Insulation Thickness(mm)" line when the plain "Insulation" line is missing.
        wall_pattern = re.compile(
            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
            r"(?:Insulation(?!\s*Thickness\(mm\))\s*(?P<insulation>[^\n]*)\n)?"
            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
            re.MULTILINE
        )

        # TODO: Alternative walls are not effectively picked up yet; the alternative
        # fields in each entry are placeholders.

        wall_data = []
        for match in wall_pattern.finditer(wall_section):
            building_part = match.group("section")
            # "Main Property Alternative" is reported under the plain "Main Property" name.
            if "Main Property" in building_part:
                building_part = "Main Property"

            thickness = match.group("thickness")

            wall_data.append({
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                # The group only ever captures digits, so a non-empty capture is always
                # convertible with int().
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            })

        return wall_data
|
|
|
|
|
|
class EPRExtractor:
    """
    Pulls the property address and the space/water heating figures out of the text of
    an Energy Performance Report (EPR).

    Results accumulate in ``self.data``; each extraction method raises ``ValueError``
    when its pattern is not found in the text.
    """

    # Space heating value, then (possibly several lines later) the water heating value.
    _HEATING_RE = re.compile(
        r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
        re.DOTALL,
    )
    # Address text up to the "Town" line; the town value is captured but not stored.
    _ADDRESS_RE = re.compile(r"Address\s*(.*?)\nTown\s*(.*?)\n", re.DOTALL)

    def __init__(self, pdf_text):
        """
        Stores the extracted PDF text and starts with an empty result dictionary.
        """
        self.data = {}
        self.text = pdf_text

    def extract_heating_consumption(self):
        """
        Reads the space heating and water heating values (KWH) from the report and
        stores them as integers, with thousands separators removed, under
        "Space Heating (KWH)" and "Water Heating (KWH)".

        Raises ValueError if the heating figures are not present.
        """
        found = self._HEATING_RE.search(self.text)
        if found is None:
            raise ValueError("No heating data found in the report")

        space_raw, water_raw = found.groups()
        # Convert both values before storing either, so a conversion failure leaves
        # self.data untouched.
        space_kwh = int(space_raw.replace(",", ""))
        water_kwh = int(water_raw.replace(",", ""))

        self.data["Space Heating (KWH)"] = space_kwh
        self.data["Water Heating (KWH)"] = water_kwh

    def extract_address(self):
        """
        Reads the text between "Address" and the following "Town" line and stores it,
        stripped of surrounding whitespace, under "Address".

        Raises ValueError if no address block is present.
        """
        found = self._ADDRESS_RE.search(self.text)
        if found is None:
            raise ValueError("No address found in the report")

        self.data["Address"] = found.group(1).strip()

    def extract_all(self):
        """
        Runs the address extraction followed by the heating extraction and returns
        the populated result dictionary.
        """
        for step in (self.extract_address, self.extract_heating_consumption):
            step()
        return self.data
|