Model/survey_report/extraction/quidos.py
2025-02-18 19:49:29 +00:00

256 lines
9.7 KiB
Python

import re
class SiteNotesExtractor:
    """
    Extracts headline figures from the text of an EPC summary report.

    Every ``extract_*`` method searches ``self.text`` with regular expressions.
    All of them except ``extract_walls`` write their findings into ``self.data``
    and return nothing; ``extract_walls`` returns its result instead and is not
    called by ``extract_all``.
    """

    def __init__(self, pdf_text: str) -> None:
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        # Accumulates the values found by the extract_* methods.
        self.data = {}

    def extract_sap_rating(self) -> None:
        """
        Extracts the current and potential SAP rating from the report.

        Stores the band letters (A-G) and the integer scores in ``self.data``.

        Raises:
            ValueError: if the rating line cannot be found.
        """
        match = re.search(
            r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)",
            self.text,
        )
        if not match:
            raise ValueError("No SAP rating found in the report")
        current_band, current_score, potential_band, potential_score = match.groups()
        self.data.update({
            "Current EPC Band": current_band,
            "Current SAP Rating": int(current_score),
            "Potential EPC Band": potential_band,
            "Potential SAP Rating": int(potential_score),
        })

    def extract_carbon_emissions(self) -> None:
        """
        Extracts the current annual carbon emissions (TCO2).

        Only the "Current annual emissions" figure is read; it is stored as a
        float in ``self.data``.

        Raises:
            ValueError: if the emissions line cannot be found.
        """
        match = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)
        if not match:
            raise ValueError("No carbon emissions found in the report")
        self.data["Current Carbon Emissions (TCO2)"] = float(match.group(1))

    def extract_building_dimensions(self) -> None:
        """
        Extracts dimensions for each building part and stores them in a list.
        Handles Main Property and multiple extensions.

        Stores one dict per building part under ``"Building Dimensions"`` and
        the summed floor area / heat loss area under
        ``"Total Building Dimensions"``.

        Raises:
            ValueError: if the dimensions table cannot be located, or if it
                contains no recognisable building-part rows.
        """
        # The table sits between its header row and the "5.0 Conservatory" heading.
        dimensions_section = re.search(
            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
            r"Party Wall "
            r"Length \(m\)\n"
            r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL
        )
        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")
        dimensions_text = dimensions_section.group(1)

        # One row per building part: a label followed by four numeric columns.
        building_part_pattern = re.compile(
            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )
        building_parts = []
        for row in building_part_pattern.finditer(dimensions_text):
            room_height = float(row.group(3))
            loss_perimeter = float(row.group(4))
            building_parts.append({
                "Building Part": row.group(1).strip(),
                "Part Floor Area (m2)": float(row.group(2)),
                "Room Height (m)": room_height,
                "Loss Perimeter (m)": loss_perimeter,
                "Party Wall Length (m)": float(row.group(5)),
                # Derived value: exposed perimeter multiplied by room height.
                "Heat Loss Area (m2)": loss_perimeter * room_height,
            })
        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self) -> None:
        """
        Extracts the estimated annual energy costs (£) from the report.

        Thousands separators are removed before the value is stored as a float.

        Raises:
            ValueError: if the energy cost line cannot be found.
        """
        match = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)
        if not match:
            raise ValueError("No bills estimate found in the report")
        self.data["Estimated Annual Energy Cost (£)"] = float(match.group(1).replace(",", ""))

    def extract_all(self) -> dict:
        """
        Runs the SAP rating, carbon emissions, bills and building dimension
        extractions and returns the dictionary holding the extracted data.

        Raises:
            ValueError: propagated from whichever extraction fails first.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()
        # TODO: extract specific measures - primary wall, secondary wall, roof,
        # floor, heating system, hot water system, windows, doors, lighting,
        # ventilation, solar.
        return self.data

    def extract_walls(self) -> list:
        """
        Extracts wall type, insulation and thickness details for each building
        part listed in the 7.0 Walls section of the summary PDF text.

        Returns:
            A list with one dict per matched building part. Text fields that
            are missing are reported as "Unknown"; a missing wall thickness is
            ``None``. A "Main Property Alternative" block is reported with
            "Building Part" set to "Main Property". The "Alternative ..." keys
            are placeholders and are currently always ``None``.

        Raises:
            ValueError: if the walls section cannot be located.
        """
        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", self.text, re.DOTALL)
        if not wall_section_match:
            raise ValueError("Failed to locate the walls section in the text.")
        # The capture stops before the newline that precedes "8.0 Roofs", so the
        # last line of the section has no terminator. Most fields below need a
        # trailing "\n" to match; restore it so a closing line is not dropped.
        wall_section = wall_section_match.group(1) + "\n"

        wall_pattern = re.compile(
            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
            # The lookahead stops this field from swallowing an
            # "Insulation Thickness(mm)" line when the plain "Insulation" line
            # is absent from the report.
            r"(?:Insulation(?!\s*Thickness\(mm\))\s*(?P<insulation>[^\n]*)\n)?"
            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?"
        )

        # TODO: We aren't effectively picking up alternative walls. The details
        # that follow an "Alternative Wall Sheltered" heading (construction,
        # insulation, insulation thickness, thickness measured, thickness) are
        # not parsed yet, so the "Alternative ..." keys below stay None.
        wall_data = []
        for match in wall_pattern.finditer(wall_section):
            building_part = match.group("section")
            if "Main Property" in building_part:
                building_part = "Main Property"
            thickness = match.group("thickness")
            wall_data.append({
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                # The group only ever captures digits, so int() cannot fail here.
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            })
        return wall_data
class EPRExtractor:
    """
    Extracts space heating, water heating, and address from an Energy Performance Report (EPR).

    Each ``extract_*`` method searches ``self.text`` with a regular expression
    and writes what it finds into ``self.data``.
    """

    def __init__(self, pdf_text: str) -> None:
        """
        Initializes the EPRExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        # Accumulates the values found by the extract_* methods.
        self.data = {}

    def extract_heating_consumption(self) -> None:
        """
        Extracts space heating and water heating values from the report.

        Thousands separators are removed and both figures are stored as
        integers in ``self.data``.

        Raises:
            ValueError: if the two heating lines cannot be found.
        """
        match = re.search(
            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
            self.text,
            re.DOTALL
        )
        if not match:
            raise ValueError("No heating data found in the report")
        space_heating, water_heating = match.groups()
        self.data.update({
            "Space Heating (KWH)": int(space_heating.replace(",", "")),
            "Water Heating (KWH)": int(water_heating.replace(",", ""))
        })

    def extract_address(self) -> None:
        """
        Extracts the address block from the report.

        The stored value is the text between the "Address" label and the
        following line that starts with "Town", stripped of surrounding
        whitespace. It may span several lines. The town itself only marks the
        end of the block and is not stored.

        Raises:
            ValueError: if no "Address" label followed by a "Town" line is found.
        """
        # Only the start of the Town line is needed as a delimiter. Not
        # requiring anything after it lets the match succeed when the Town line
        # is the last line of the text and has no trailing newline.
        match = re.search(r"Address\s*(.*?)\nTown", self.text, re.DOTALL)
        if not match:
            raise ValueError("No address found in the report")
        self.data["Address"] = match.group(1).strip()

    def extract_all(self) -> dict:
        """
        Runs all extraction methods and returns a dictionary with extracted data.

        Raises:
            ValueError: propagated from whichever extraction fails first.
        """
        self.extract_address()
        self.extract_heating_consumption()
        return self.data