Debugging extract_epr for the older Elmhurst EPR format

2026-07-27 23:35:01 +00:00 · 2025-01-28 22:15:53 +00:00 · 2025-01-28 22:15:53 +00:00 · ca7a0e9d10
commit ca7a0e9d10
parent 86deed8115
2 changed files with 34 additions and 13 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -747,12 +747,30 @@ def extract_epr(pdf_path):

        # Extract Current and Potential SAP ratings
        sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
-        current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
-        data["Current SAP Rating"] = current_sap
+        if sap_match is None:
+            # Handles the older format of the elmhurst EPR
+            # The text will look something like this:
+            # Least energy efficient - higher running costsD 61 - we extract D 61
+            sap_match = re.search(
+                r"(?P<current_epc>[A-G])\s(?P<current_sap>\d{1,3})(?P<potential_epc>[A-G])\s(?P<potential_sap>\d{1,3})",
+                text)
+            data["Current EPC Band"] = sap_match.group("current_epc")
+            data["Current SAP Rating"] = int(sap_match.group("current_sap"))
+        else:
+            current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
+            data["Current SAP Rating"] = current_sap

        # Extract the primary energy use intensity
        additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
-        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+        if additional_rating_match:
+            data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+        else:
+            # Handles the older format of the Elmhurst EPR
+            primary_energy_match = re.search(r"actual consumption\.\n(?P<primary_energy>\d+)", text)
+            data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy"))
+            # We calculate the primary energy use intensity by dividing by floor area
+            floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
+            data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)

        # Extract Number of Storeys
        storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
@ -2983,8 +3001,13 @@ def revised_model():
    # We now do a large pull of all of the data
    extracted_data = []
    for survey_folder in tqdm(survey_folders):
+
        survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)

+        # Check that the survey folder is actually a folder
+        if not os.path.isdir(survey_folder_path):
+            continue
+
        # List the folders inside of the survey folder
        survey_subfolders = [
            name for name in os.listdir(survey_folder_path)
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -162,19 +162,17 @@ def app():
    Property UPRN

    """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern"
-    DATA_FILENAME = "January 2025 Additions Query.xlsx"
-    SHEET_NAME = "Jan 2025 additions"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing"
+    DATA_FILENAME = "For Housing Data pull.xlsx"
+    SHEET_NAME = "Sheet1"
    POSTCODE_COLUMN = "Post Code"
-    FULLADDRESS_COLUMN = "Street / Block Name"
-    ADDRESS1_COLUMN = None
-    ADDRESS1_METHOD = "first_word"
-    ADDRESS_COLS_TO_CONCAT = []
+    FULLADDRESS_COLUMN = None
+    ADDRESS1_COLUMN = "NO."
+    ADDRESS1_METHOD = None
+    ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"]

    # Maps addresses to uprn in problematic cases
-    MANUAL_UPRN_MAP = {
-        "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560
-    }
+    MANUAL_UPRN_MAP = {}

    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
    asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()