implementing summary report extraction

2026-06-08 11:17:27 +00:00 · 2024-11-28 12:00:43 +00:00 · 2024-11-28 12:00:43 +00:00 · 5a2ffe646c
commit 5a2ffe646c
parent 8b875cbccf
2 changed files with 71 additions and 31 deletions
--- a/etl/lodgement/app.py
+++ b/etl/lodgement/app.py
@ -170,8 +170,8 @@ def handler():

            epr_to_insert = {
                "Postcode": extracted_contents["elmhurst epr"]["Postcode"],
-                "City/County": None,
-                "District/Town": None,
+                "City/County": extracted_contents["elmhurst epr"]["County"],
+                "District/Town": extracted_contents["elmhurst epr"]["Town"],
                "Local Authority": None,
                'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"],
                'Pre Heat Transfer': pre_heat_transfer,
@ -207,6 +207,35 @@ def handler():
                cr_to_insert
            )

+        if extracted_contents.get("elmhurst summary report"):
+            total_floor_area = sum(
+                [x["Floor Area (m2)"] for x in extracted_contents["elmhurst summary report"]["Building Parts"]] +
+                # Get the conservatory floor area
+                [extracted_contents["elmhurst summary report"]["Conservatory"]["Conservatory Floor Area"]]
+            )
+
+            pre_heat_transfer = (
+                extracted_contents["elmhurst summary report"]["Primary Energy Use Intensity (kWh/m2/yr)"]
+            )
+            pre_heat_demand = None  # Don't have this
+
+            summary_to_insert = {
+                "Postcode": extracted_contents["elmhurst summary report"]["Postcode"],
+                "City/County": extracted_contents["elmhurst summary report"]["County"],
+                "District/Town": extracted_contents["elmhurst summary report"]["Town"],
+                'SAP Rating Pre (from IMA)': extracted_contents["elmhurst summary report"]["Current SAP Rating"],
+                'Pre Heat Transfer': pre_heat_transfer,
+                'Pre Total Floor Area': total_floor_area,
+                'Pre Heat Demand': pre_heat_demand,
+                "R. Assessor - Name": extracted_contents["elmhurst summary report"]["Assessor Name"],
+                "Retrofit Assessment Date": extracted_contents["elmhurst summary report"]["Assessment Date"],
+            }
+
+            update_dictionary_with_check(
+                output_row_data,
+                summary_to_insert
+            )
+
        extracted.append(output_row_data)

    extracted_df = pd.DataFrame(extracted)
--- a/utils/file_data_extraction.py
+++ b/utils/file_data_extraction.py
@ -398,6 +398,15 @@ class ElmhurstEprExtractor:
        data["Address"] = address_match.group(1).strip()
        data["Postcode"] = data["Address"].split(",")[-1].strip()

+        # TODO: populate these address fields from the EPR text; they are None placeholders for now.
+        data["Region"] = None
+        data["House Name"] = None
+        data["House No"] = None
+        data["Street"] = None
+        data["Locality"] = None
+        data["Town"] = None
+        data["County"] = None
+
        sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
        if not sap_match:
            logger.error("Failed to extract SAP rating.")
@ -657,26 +666,7 @@ class ElmhurstSummaryReportExtractor:
                    }
                )

-        # Calculate aggregated dimensions
-        main_property = [part for part in data if "Main Property" in part["Building Part"]]
-        first_extensions = [part for part in data if "1st Extension" in part["Building Part"]]
-        dimensions = {
-            "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]),
-            "Total Ground Floor Area (m2)": sum(
-                [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]]
-            ),
-            "RIR Floor Area": sum(
-                [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]]
-            ),
-            "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if
-                                                 x["Perimeter (m)"] and x["Room Height (m)"]]),
-            "First Extension Wall Area (m2)": sum(
-                [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if
-                 x["Perimeter (m)"] and x["Room Height (m)"]]
-            ),
-        }
-
-        return dimensions
+        return data

    @staticmethod
    def extract_roof_details(text):
@ -869,7 +859,6 @@ class ElmhurstSummaryReportExtractor:
        """

        data = {}
-
        with (open(self.file_path, "rb") as file):
            reader = PyPDF2.PdfReader(file)
            text = ""
@ -885,29 +874,51 @@ class ElmhurstSummaryReportExtractor:

            # Address and postcode
            postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
+            postcode = postcode.group(1).strip() if postcode else ""
+
            region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
+            region = region.group(1).strip() if region else ""
+
            house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
+            house_name = house_name.group(1).strip() if house_name else ""
+
            house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
+            house_no = house_no.group(1).strip() if house_no else ""
+
            street = re.search(r"Street:\s*(.*?)\nLocality:", text)
+            street = street.group(1).strip() if street else ""
+
            locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
+            locality = locality.group(1).strip() if locality else ""
+
            town = re.search(r"Town:\s*(.*?)\nCounty:", text)
+            town = town.group(1).strip() if town else ""
+
            county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)
+            county = county.group(1).strip() if county else ""

            # Address components in display order; each is already a stripped string ("" when not found)
            address_parts = [
-                house_no.group(1).strip() if house_no else "",
-                house_name.group(1).strip() if house_name else "",
-                street.group(1).strip() if street else "",
-                locality.group(1).strip() if locality else "",
-                town.group(1).strip() if town else "",
-                county.group(1).strip() if county else "",
-                region.group(1).strip() if region else "",
-                postcode.group(1).strip() if postcode else ""
+                house_no,
+                house_name,
+                street,
+                locality,
+                town,
+                county,
+                region,
+                postcode
            ]

            # Join non-empty parts with a comma
            data["Address"] = ", ".join([part for part in address_parts if part])
            # `postcode` was already reduced to a stripped string above ("" when the
            # pattern did not match), so it must not be treated as a re.Match here.
            data["Postcode"] = postcode
+            data["Region"] = region
+            data["House Name"] = house_name
+            data["House No"] = house_no
+            data["Street"] = street
+            data["Locality"] = locality
+            data["Town"] = town
+            data["County"] = county

            # Extract Current SAP rating
            sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)