extracting epr

2026-07-27 23:35:01 +00:00 · 2025-01-30 01:09:41 +00:00 · 2025-01-30 01:09:41 +00:00 · daabf2a586
commit daabf2a586
parent 32b053e7db
3 changed files with 71 additions and 7 deletions
--- a/survey_report/app.py
+++ b/survey_report/app.py
@@ -1,7 +1,7 @@
 import os
 import PyPDF2
 from survey_report.extraction.detect_report_type import detect_report_type
-from survey_report.extraction.quidos import SiteNotesExtractor
+from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor


 def handle():
@@ -33,12 +33,18 @@ def handle():
        if report_type is not None:
            file_mapping[report_type] = text

-        # Check the report type
-        report_type = detect_report_type(os.path.join(data_folder, file))
-
    # This is only set up to work with quido site notes so we must have it
-    if "quidos_site_notes" not in file_mapping:
-        raise ValueError("No quidos site notes found")
-
    site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
    site_notes = site_notes_extractor.extract_all()
+
+    # We also must have an EPR
+    epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
+    epr = epr_extractor.extract_all()
+
+    # We now produce the combined data sheet which is the starting figure:
+    data_sheet = {**epr, **site_notes}
+    del data_sheet['Building Dimensions']
+    # We unnest the Total Building Dimensions
+    data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
+    data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
+    del data_sheet["Total Building Dimensions"]
--- a/survey_report/extraction/detect_report_type.py
+++ b/survey_report/extraction/detect_report_type.py
@@ -16,4 +16,7 @@ def detect_report_type(first_page):
    ):
        return "quidos_site_notes"

+    if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page):
+        return "quidos_epr"
+
    return None
--- a/survey_report/extraction/quidos.py
+++ b/survey_report/extraction/quidos.py
@@ -109,3 +109,58 @@ class SiteNotesExtractor:
        self.extract_bills_estimate()
        self.extract_building_dimensions()
        return self.data
+
+
+class EPRExtractor:
+    """
+    Pulls the address and the space/water heating figures (KWH) out of the text of an Energy Performance Report (EPR), accumulating them in ``self.data``.
+    """
+
+    def __init__(self, pdf_text):
+        """
+        Stores the report text in ``self.text`` and starts ``self.data`` as an empty dict that the extract_* methods fill in.
+        """
+        self.text = pdf_text
+        self.data = {}
+
+    def extract_heating_data(self):
+        """
+        Stores the space and water heating figures in ``self.data`` as ints (comma separators removed); raises ValueError when the pattern is not found.
+        """
+        pattern = re.search(
+            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
+            self.text,
+            re.DOTALL
+        )
+
+        if not pattern:
+            raise ValueError("No heating data found in the report")
+
+        self.data.update({
+            "Space Heating (KWH)": int(pattern.group(1).replace(",", "")),
+            "Water Heating (KWH)": int(pattern.group(2).replace(",", ""))
+        })
+
+    def extract_address(self):
+        """
+        Stores the stripped text between "Address" and the following "Town" line in ``self.data["Address"]`` (the town is matched but not stored); raises ValueError when not found.
+        """
+        pattern = re.search(
+            r"Address\s*(.*?)\nTown\s*(.*?)\n",
+            self.text,
+            re.DOTALL
+        )
+
+        if not pattern:
+            raise ValueError("No address found in the report")
+
+        full_address = pattern.group(1).strip()
+        self.data["Address"] = full_address
+
+    def extract_all(self):
+        """
+        Runs extract_address() and then extract_heating_data(), returning ``self.data``; a ValueError raised by either step propagates to the caller.
+        """
+        self.extract_address()
+        self.extract_heating_data()
+        return self.data