extracting epr

This commit is contained in:
Khalim Conn-Kowlessar 2025-01-30 01:09:41 +00:00
parent 32b053e7db
commit daabf2a586
3 changed files with 71 additions and 7 deletions

View file

@ -1,7 +1,7 @@
import os
import PyPDF2
from survey_report.extraction.detect_report_type import detect_report_type
from survey_report.extraction.quidos import SiteNotesExtractor
from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
def handle():
@ -33,12 +33,18 @@ def handle():
if report_type is not None:
file_mapping[report_type] = text
# Check the report type
report_type = detect_report_type(os.path.join(data_folder, file))
# This is only set up to work with Quidos site notes, so they must be present
if "quidos_site_notes" not in file_mapping:
raise ValueError("No quidos site notes found")
site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
site_notes = site_notes_extractor.extract_all()
# We also must have an EPR
epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
epr = epr_extractor.extract_all()
# We now produce the combined data sheet which is the starting figure:
data_sheet = {**epr, **site_notes}
del data_sheet['Building Dimensions']
# We unnest the Total Building Dimensions
data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
del data_sheet["Total Building Dimensions"]

View file

@ -16,4 +16,7 @@ def detect_report_type(first_page):
):
return "quidos_site_notes"
if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page):
return "quidos_epr"
return None

View file

@ -109,3 +109,58 @@ class SiteNotesExtractor:
self.extract_bills_estimate()
self.extract_building_dimensions()
return self.data
class EPRExtractor:
    """
    Extracts space heating, water heating, and address from an Energy Performance Report (EPR).

    The extractor works on the plain text of the report. Each ``extract_*``
    method stores its findings in ``self.data``; ``extract_all`` runs every
    extraction and returns that dictionary.
    """

    # Matches the "Space Heating(KWH)" figure and the "Water Heating(KWH)"
    # figure that starts a later line. Used with re.DOTALL so that arbitrary
    # text (including newlines) may sit between the two labels.
    _HEATING_PATTERN = (
        r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)"
    )

    # Captures everything between the "Address" label and the line that starts
    # with "Town". The Town value itself is not needed, so it is not matched;
    # this also means a Town line at the very end of the text (no trailing
    # newline) no longer prevents the address from being found.
    _ADDRESS_PATTERN = r"Address\s*(.*?)\nTown"

    def __init__(self, pdf_text: str) -> None:
        """
        Initializes the EPRExtractor with the extracted PDF text.

        Args:
            pdf_text: Text of the report to search.
        """
        self.text = pdf_text
        self.data = {}

    @staticmethod
    def _parse_kwh(raw: str, label: str) -> int:
        """
        Converts a captured figure such as ``"12,345"`` to an integer.

        Args:
            raw: Text captured by the heating pattern (digits and commas).
            label: Field name used in the error message.

        Raises:
            ValueError: If ``raw`` contains no digits (e.g. only commas).
        """
        digits = raw.replace(",", "")
        if not digits:
            # The pattern allows a comma-only capture; fail with a message
            # that names the field instead of int('') complaining.
            raise ValueError(f"No numeric value found for {label} in the report")
        return int(digits)

    def extract_heating_data(self) -> None:
        """
        Extracts space heating and water heating values from the report.

        Stores the integers under "Space Heating (KWH)" and
        "Water Heating (KWH)" in ``self.data``.

        Raises:
            ValueError: If the heating figures cannot be found or are not numeric.
        """
        match = re.search(self._HEATING_PATTERN, self.text, re.DOTALL)
        if not match:
            raise ValueError("No heating data found in the report")

        # Parse both values before touching self.data so a failure leaves it unchanged.
        space_heating = self._parse_kwh(match.group(1), "Space Heating (KWH)")
        water_heating = self._parse_kwh(match.group(2), "Water Heating (KWH)")
        self.data.update({
            "Space Heating (KWH)": space_heating,
            "Water Heating (KWH)": water_heating,
        })

    def extract_address(self) -> None:
        """
        Extracts the address from the report.

        The address is the text between the "Address" label and the line
        starting with "Town", stripped of surrounding whitespace; it may span
        several lines. It is stored under "Address" in ``self.data``.

        Raises:
            ValueError: If no address can be found.
        """
        match = re.search(self._ADDRESS_PATTERN, self.text, re.DOTALL)
        if not match:
            raise ValueError("No address found in the report")

        self.data["Address"] = match.group(1).strip()

    def extract_all(self) -> dict:
        """
        Runs all extraction methods and returns a dictionary with extracted data.

        Raises:
            ValueError: If the address or the heating data is missing.
        """
        self.extract_address()
        self.extract_heating_data()
        return self.data