diff --git a/survey_report/app.py b/survey_report/app.py index 825a3658..f59c9984 100644 --- a/survey_report/app.py +++ b/survey_report/app.py @@ -1,7 +1,7 @@ import os import PyPDF2 from survey_report.extraction.detect_report_type import detect_report_type -from survey_report.extraction.quidos import SiteNotesExtractor +from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor def handle(): @@ -33,12 +33,18 @@ def handle(): if report_type is not None: file_mapping[report_type] = text - # Check the report type - report_type = detect_report_type(os.path.join(data_folder, file)) - # This is only set up to work with quido site notes so we must have it - if "quidos_site_notes" not in file_mapping: - raise ValueError("No quidos site notes found") - site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"]) site_notes = site_notes_extractor.extract_all() + + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["quidos_epr"]) + epr = epr_extractor.extract_all() + + # We now produce the combined data sheet which is the starting figure: + data_sheet = {**epr, **site_notes} + del data_sheet['Building Dimensions'] + # We unnest the Total Building Dimensions + data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + del data_sheet["Total Building Dimensions"] diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py index fe1600e7..434a3fb4 100644 --- a/survey_report/extraction/detect_report_type.py +++ b/survey_report/extraction/detect_report_type.py @@ -16,4 +16,7 @@ def detect_report_type(first_page): ): return "quidos_site_notes" + if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page): + return "quidos_epr" + return None diff --git a/survey_report/extraction/quidos.py 
class EPRExtractor:
    """
    Extract the address and heating figures from Energy Performance Report text.

    The extractor works on the plain text of an Energy Performance Report
    (EPR) and accumulates its results in ``self.data`` under these keys:

    * ``"Address"`` - text found between the ``Address`` and ``Town`` labels.
    * ``"Space Heating (KWH)"`` - integer value following ``Space Heating(KWH)``.
    * ``"Water Heating (KWH)"`` - integer value following ``Water Heating(KWH)``.
    """

    # Compiled once at class-definition time instead of on every call.
    # DOTALL lets ``.*?`` run across line breaks, so anything between the
    # space-heating figure and the water-heating label is skipped.
    _HEATING_RE = re.compile(
        r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
        re.DOTALL,
    )
    # The ``Town`` line is only used as the terminator of the address, so its
    # value is deliberately not captured.
    _ADDRESS_RE = re.compile(r"Address\s*(.*?)\nTown\s*.*?\n", re.DOTALL)

    def __init__(self, pdf_text: str) -> None:
        """
        Store the report text and start with an empty result dictionary.

        Args:
            pdf_text: Text previously extracted from the EPR PDF.
        """
        self.text = pdf_text
        self.data = {}

    @staticmethod
    def _parse_kwh(raw: str) -> int:
        """Convert a captured figure such as ``"12,345"`` to ``12345``.

        Raises:
            ValueError: If the token holds no digits (e.g. only commas). The
                regex character class ``[\\d,]`` allows that, and ``int("")``
                would otherwise fail with an unrelated-looking message.
        """
        digits = raw.replace(",", "")
        if not digits:
            raise ValueError("No heating data found in the report")
        return int(digits)

    def extract_heating_data(self) -> None:
        """
        Extract the space heating and water heating values from the report.

        Stores both values as integers in ``self.data``; thousands separators
        (commas) are removed before conversion.

        Raises:
            ValueError: If the heating labels are not found, or a matched
                figure contains no digits.
        """
        match = self._HEATING_RE.search(self.text)
        if match is None:
            raise ValueError("No heating data found in the report")

        # Parse both figures before touching self.data so that a failure on
        # the second one does not leave a half-populated result behind.
        space_heating = self._parse_kwh(match.group(1))
        water_heating = self._parse_kwh(match.group(2))

        self.data.update({
            "Space Heating (KWH)": space_heating,
            "Water Heating (KWH)": water_heating,
        })

    def extract_address(self) -> None:
        """
        Extract the address from the report.

        The address is the text between the ``Address`` label and the line
        that starts the ``Town`` field, stripped of surrounding whitespace.
        The town itself is not included in the stored value.

        Raises:
            ValueError: If no ``Address`` ... ``Town`` section is found.
        """
        match = self._ADDRESS_RE.search(self.text)
        if match is None:
            raise ValueError("No address found in the report")

        self.data["Address"] = match.group(1).strip()

    def extract_all(self) -> dict:
        """
        Run every extraction step and return the collected data.

        Returns:
            The ``self.data`` dictionary (the same object, not a copy)
            containing the address and both heating figures.

        Raises:
            ValueError: If the address or the heating data cannot be found.
        """
        self.extract_address()
        self.extract_heating_data()
        return self.data