From 11a4bc24a1903f4f384aef48fd006ca8c17c28e8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 28 Jan 2025 17:42:19 +0000
Subject: [PATCH 01/72] anonymised sharepoint keys

---
 .../panacap_ventures/sample_remote_assessments.py |  1 +
 etl/customers/stonewater/Wave 3 Preparation.py    |  1 -
 etl/customers/stonewater/data_cleaning.py         | 15 ++++++++++-----
 3 files changed, 11 insertions(+), 6 deletions(-)
 create mode 100644 etl/customers/panacap_ventures/sample_remote_assessments.py

diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py
new file mode 100644
index 00000000..1a5ddff7
--- /dev/null
+++ b/etl/customers/panacap_ventures/sample_remote_assessments.py
@@ -0,0 +1 @@
+# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 8538188b..b1bf0638 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2936,7 +2936,6 @@ def revised_model():
     missed = original_archetypes[
         ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
     ]["Archetype ID"].unique()
-    assert
 
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py
index 8751960c..7ee06fcd 100644
--- a/etl/customers/stonewater/data_cleaning.py
+++ b/etl/customers/stonewater/data_cleaning.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 from tqdm import tqdm
+from etl.access_reporting.app import SharePointClient
 
 
 def delete_large_files():
@@ -66,13 +67,17 @@ def delete_large_files():
 def download_data_from_sharepoint():
     # Given a sharepoint location, this function will download the retrofit assessment folders from the locations
     # specified in the sharepoint location
-    from etl.access_reporting.app import SharePointClient
+
+    SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None)
+    SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
+    SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None)
+    OSMOSIS_SHAREPOINT_SITE_ID = os.getenv("OSMOSIS_SHAREPOINT_SITE_ID", None)
 
     sharepoint_client = SharePointClient(
-        tenant_id="10d5af8b-2cfd-4882-9ccd-b96e4812dacf",
-        client_id="6832a4c5-fb8c-4082-a746-4f51e1020f0d",
-        client_secret="xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ",
-        site_id="bc925a9a-ad0b-4de9-9a3c-e61014cc7489"
+        tenant_id=SHAREPOINT_TENANT_ID,
+        client_id=SHAREPOINT_CLIENT_ID,
+        client_secret=SHAREPOINT_CLIENT_SECRET,
+        site_id=OSMOSIS_SHAREPOINT_SITE_ID
     )
 
     # Retrieve the data from Sharepoint and write to local machine

From 86deed8115c8b630ca5516f113ec5beb585460e0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 28 Jan 2025 18:13:07 +0000
Subject: [PATCH 02/72] setting up the stonewater assessment extraction process

---
 .../sample_remote_assessments.py              |   1 -
 .../stonewater/Wave 3 Preparation.py          | 116 +++++++++++++++++-
 2 files changed, 112 insertions(+), 5 deletions(-)
 delete mode 100644 etl/customers/panacap_ventures/sample_remote_assessments.py

diff --git a/etl/customers/panacap_ventures/sample_remote_assessments.py b/etl/customers/panacap_ventures/sample_remote_assessments.py
deleted file mode 100644
index 1a5ddff7..00000000
--- a/etl/customers/panacap_ventures/sample_remote_assessments.py
+++ /dev/null
@@ -1 +0,0 @@
-# The address we're looking from for the remote assessments is Natwest House, Shenley Rd, Borehamwood WD6 1DL
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index b1bf0638..105628e9 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2928,14 +2928,122 @@ def revised_model():
     original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
     original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
 
-    original_archetypes = original_archetypes[
-        ["Address ID", "Archetype ID", ""]
-    ]
-
     # Check if we have all of the addresses
     missed = original_archetypes[
         ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
     ]["Archetype ID"].unique()
 
+    assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'}
+
+    original_archetypes = original_archetypes[
+        ["Address ID", "Archetype ID", "Archetype Group Rank"]
+    ]
+
+    # Merge these archetypes on to the new priority postcodes
+    new_priority_postcodes = new_priority_postcodes.merge(
+        original_archetypes, how="left", on="Address ID"
+    )
+
+    # Basic check, should have no rows with missing Archetype ID, where
+    assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin(
+        original_archetypes["Address ID"]
+    ).sum()) == 0
+
+    # We pull together the survey data sheet
+    survey_folders = []
+
+    # Loop over each survey folder and list its contents
+    for i in range(1, NUM_FOLDERS + 1):
+        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+        if os.path.isdir(folder_path):  # Check if folder exists
+            folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+            survey_folders.extend(folder_contents)  # Append contents to the master list
+
+    wave_21_folders = [
+        "1. Herefordshire",
+        "2. Bedfordshire",
+        "3. Wiltshire",
+        "4. Bournemouth",
+        "5. Coventry",
+        "6. West Sussex",
+        "7. Dorset",
+        "8. Cambridgeshire",
+        "9. Guildford",
+        "10. Little Island",
+        "11. CCS Dorset"
+    ]
+
+    for wave_2_1_folder in wave_21_folders:
+        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder)
+        if os.path.isdir(folder_path):  # Check if folder exists
+            folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in
+                               os.listdir(folder_path)]
+            survey_folders.extend(folder_contents)  # Append contents to the master list
+
+    # We now do a large pull of all of the data
+    extracted_data = []
+    for survey_folder in tqdm(survey_folders):
+        survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
+
+        # List the folders inside of the survey folder
+        survey_subfolders = [
+            name for name in os.listdir(survey_folder_path)
+            if os.path.isdir(os.path.join(survey_folder_path, name))
+        ]
+
+        # Check if there's a "retrofit assessment" folder
+        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+
+        ra_folder = next(
+            (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+            None
+        )
+
+        # If retrofit assessment folder exists, check if it has content
+        if retrofit_folder or ra_folder:
+            if retrofit_folder:
+                retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+            else:
+                retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
+
+            # Check if everything inside is a sub-folder and the number of folders is 2
+            items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
+            all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
+            if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
+                # Get the folder that isn't Property Pics
+                retrofit_folder_path = os.path.join(
+                    retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
+                )
+
+            if os.listdir(retrofit_folder_path):  # If not empty
+                summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
+                if summary_data:
+                    summary_data = {
+                        "survey_folder": survey_folder,
+                        **summary_data,
+                    }
+                    extracted_data.append(summary_data)
+                    continue
+            else:
+                # Then we have an empty Retrofit Assessment folder
+                continue
+
+        # If no retrofit folder or it was empty, check files in survey_folder
+        summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+        if not summary_data:
+            if len(survey_subfolders) == 1:
+                survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+                summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
+        if summary_data:
+            summary_data = {
+                "survey_folder": survey_folder,
+                **summary_data,
+            }
+            extracted_data.append(summary_data)
+
+    retrofit_assessment_data = pd.DataFrame(extracted_data)
+    # TODO - Save this data
+
 # if __name__ == "__main__":
 #     main()

From ca7a0e9d107c7da66fd7a8d5066834b7dbf00978 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 28 Jan 2025 22:15:53 +0000
Subject: [PATCH 03/72] debugging extract epr for old elmhurst epr

---
 .../stonewater/Wave 3 Preparation.py          | 29 +++++++++++++++++--
 etl/route_march_data_pull/app.py              | 18 +++++-------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 105628e9..ee314f17 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -747,12 +747,30 @@ def extract_epr(pdf_path):
 
         # Extract Current and Potential SAP ratings
         sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
-        current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
-        data["Current SAP Rating"] = current_sap
+        if sap_match is None:
+            # Handles the older format of the elmhurst EPR
+            # The text will look something like this:
+            # Least energy efficient - higher running costsD 61 - we extract D 61
+            sap_match = re.search(
+                r"(?P<current_epc>[A-G])\s(?P<current_sap>\d{1,3})(?P<potential_epc>[A-G])\s(?P<potential_sap>\d{1,3})",
+                text)
+            data["Current EPC Band"] = sap_match.group("current_epc")
+            data["Current SAP Rating"] = int(sap_match.group("current_sap"))
+        else:
+            current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
+            data["Current SAP Rating"] = current_sap
 
         # Extract the primary energy use intensity
         additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
-        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+        if additional_rating_match:
+            data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+        else:
+            # Handles the older format of the Elmhurst EPR
+            primary_energy_match = re.search(r"actual consumption\.\n(?P<primary_energy>\d+)", text)
+            data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy"))
+            # We calculate the primary energy use intensity by dividing by floor area
+            floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
+            data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)
 
         # Extract Number of Storeys
         storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
@@ -2983,8 +3001,13 @@ def revised_model():
     # We now do a large pull of all of the data
     extracted_data = []
     for survey_folder in tqdm(survey_folders):
+
         survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
 
+        # Check that the survey folder is actually a folder
+        if not os.path.isdir(survey_folder_path):
+            continue
+
         # List the folders inside of the survey folder
         survey_subfolders = [
             name for name in os.listdir(survey_folder_path)
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 8d19aa84..247ce98c 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -162,19 +162,17 @@ def app():
     Property UPRN
 
     """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern"
-    DATA_FILENAME = "January 2025 Additions Query.xlsx"
-    SHEET_NAME = "Jan 2025 additions"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing"
+    DATA_FILENAME = "For Housing Data pull.xlsx"
+    SHEET_NAME = "Sheet1"
     POSTCODE_COLUMN = "Post Code"
-    FULLADDRESS_COLUMN = "Street / Block Name"
-    ADDRESS1_COLUMN = None
-    ADDRESS1_METHOD = "first_word"
-    ADDRESS_COLS_TO_CONCAT = []
+    FULLADDRESS_COLUMN = None
+    ADDRESS1_COLUMN = "NO."
+    ADDRESS1_METHOD = None
+    ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"]
 
     # Maps addresses to uprn in problematic cases
-    MANUAL_UPRN_MAP = {
-        "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560
-    }
+    MANUAL_UPRN_MAP = {}
 
     asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
     asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()

From fd98721748c9da95c3660116f33b6aa00d1be01f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 15:24:02 +0000
Subject: [PATCH 04/72] debugging epr extraction when the dimensions are
 external

---
 etl/customers/stonewater/Wave 3 Preparation.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ee314f17..4db089e7 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -465,7 +465,11 @@ def extract_building_parts_summary(text):
         r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
     )
     if not dimensions_section:
-        raise ValueError("Failed to locate dimensions section in the text.")
+        dimensions_section = re.search(
+            r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
+        )
+        if not dimensions_section:
+            raise ValueError("Failed to locate dimensions section in the text.")
 
     dimensions_text = dimensions_section.group(1)
 
@@ -898,11 +902,18 @@ def detect_report_type(pdf_path, pdf_file):
     """
     # Attempt to read the first page of the PDF to determine type
     with open(pdf_path, "rb") as file:
+        # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter
+        # This is because the pdf is irregular. We could possibly try a library like fitz to handle this
         reader = PyPDF2.PdfReader(file)
         first_page_text = reader.pages[0].extract_text() if reader.pages else ""
+        n_pages = len(reader.pages)
 
-        if is_energy_report(first_page_text):
+        if is_energy_report(first_page_text) and n_pages > 3:
+            # The EPR should have more than 3 pages
             return "epr"
+        elif is_energy_report(first_page_text) and n_pages <= 3:
+            # This is a shortened version of the EPR which isn't massively useful
+            return "short_form_epr"
         elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
             return "summary"
         elif is_condition_report(first_page_text):

From 231069f4e3e4ca2a40e114db0963c55aa56b09b7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 20:37:06 +0000
Subject: [PATCH 05/72] matching algorithm wip

---
 .../stonewater/Wave 3 Preparation.py          | 275 +++++++++++++++++-
 1 file changed, 274 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 4db089e7..904afd30 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3077,7 +3077,280 @@ def revised_model():
             extracted_data.append(summary_data)
 
     retrofit_assessment_data = pd.DataFrame(extracted_data)
-    # TODO - Save this data
+
+    # Remove some definite duplicates
+    dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"]
+    dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)]
+    dupes = dupes.sort_values("Address")
+    # Get all of the folders that end with ROSS
+    to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()
+
+    retrofit_assessment_data = retrofit_assessment_data[
+        ~retrofit_assessment_data["survey_folder"].isin(
+            [
+                "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
+                "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
+                "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS"
+            ] + to_drop
+        )
+    ]
+    # Replace \n with ""
+    retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
+
+    # retrofit_assessment_data.to_csv(
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False
+    # )
+
+    # We can read in the data as needed
+
+    # Next Step: Read in the coordinated measures and match to the extracted data
+    ############################################################
+    # CCS
+    #############################################################
+    ccs_coordination_sheet = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"),
+        header=4
+    )
+    ccs_coordination_sheet["contractor"] = "CCS"
+    # We split ccs into two sections - the first being
+    ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21)
+    ccs_coordination_sheet = ccs_coordination_sheet.head(87)
+    ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet])
+
+    ############################################################
+    # WATES
+    #############################################################
+    wates_coordination_sheet = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx"
+        ),
+        header=4
+    )
+    wates_coordination_sheet["contractor"] = "Wates"
+    # Break into the different sites:
+    # Wiltshire
+    wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267)
+    wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :]
+    wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :]
+    wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :]
+    wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :]
+    wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :]
+    wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :]
+    wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972, :]
+
+    wates_coordination = pd.concat(
+        [
+            wates_coordination_sheet_wiltshere,
+            wates_coordination_sheet_herefordshire,
+            wates_coordination_sheet_coventry,
+            wates_coordination_sheet_bedfordshire,
+            wates_coordination_sheet_bournemouth,
+            wates_coordination_sheet_cambridgeshire,
+            wates_coordination_sheet_removed_from_programme,
+            wates_coordination_sheet_abeyance
+        ]
+    )
+
+    # Combine the data back
+
+    ############################################################
+    # NEW 450 COORDINATED RETROFIT ASSESSMENTS
+    #############################################################
+
+    retrofit_packages_board = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH,
+            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
+        ),
+        header=4
+    )
+    retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
+    # Take just the rows that have been surveyed
+    retrofit_packages_board = retrofit_packages_board[
+        retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
+    ]
+
+    manual_filters = {
+        "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
+        "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
+        "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
+        'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
+        '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
+        '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
+        'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
+        'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
+        '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS',
+        '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+        '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY',
+        '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW',
+        '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS',
+        '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX',
+        '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX',
+        '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+        '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ',
+        '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
+        '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX',
+        "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX',
+        '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX',
+        '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ',
+        '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX',
+        '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA'
+    }
+
+    # We now match this retrofit packages board to the extracted data
+    matching_lookup = []
+    for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
+
+        # Handle the case that has the wrong postcode in the asset data
+        if home["Name"] in manual_filters:
+            filtered = retrofit_assessment_data[
+                retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]]
+                ].copy()
+        else:
+            filtered = retrofit_assessment_data[
+                retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+                ].copy()
+
+            # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+            to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+                home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+            )
+            if to_filter.sum() == 0:
+                to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".",
+                                                                                                                   "").str.contains(
+                    home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+                )
+            filtered = filtered[to_filter]
+
+        if filtered.empty:
+            continue
+
+        if filtered.shape[0] == 1:
+            matching_lookup.append(
+                {
+                    "survey_folder": filtered["survey_folder"].values[0],
+                    "Address ID": home["Address ID"],
+                    "Name": home["Name"]
+                }
+            )
+            continue
+
+        # home["Name"] should be contained in the survey_folder
+        filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
+        # We have an edge case wher some properties have two outputs in Sharepoint
+        if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+            raise Exception("Fix me1")
+            # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+
+        if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+            raise Exception("Fix me2")
+            # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
+        if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
+            filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
+
+        if filtered.empty:
+            continue
+        if filtered.shape[0] != 1:
+            raise Exception("something went wrong")
+
+        matching_lookup.append(
+            {
+                "survey_folder": filtered["survey_folder"].values[0],
+                "Address ID": home["Address ID"],
+                "Name": home["Name"]
+            }
+        )
+    matching_lookup = pd.DataFrame(matching_lookup)
+
+    ccs_coordination = ccs_coordination.rename(
+        columns={"Post Code": "Postcode"}
+    )
+    ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]
+    from fuzzywuzzy import fuzz
+
+    ccs_manual_filters = {}
+    ccs_matching_lookup = []
+    for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
+        # Handle the case that has the wrong postcode in the asset data
+        if home["Name"] in manual_filters:
+            filtered = retrofit_assessment_data[
+                retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]]
+                ].copy()
+        else:
+            filtered = retrofit_assessment_data[
+                retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+                ].copy()
+
+            # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+            to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+                home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+            )
+            if to_filter.sum() == 0:
+                to_filter = (
+                    filtered["survey_folder"].
+                    str.replace(r"[^\w\s]", "").
+                    str.replace(",", "").
+                    str.replace(".", "").
+                    str.contains(
+                        home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+                    )
+                )
+            if to_filter.sum() == 0:
+                to_filter = filtered["Address"].str.split(",").str[0:2].str.join("") == home["Name"]
+
+            if to_filter.sum() == 0:
+                # Do a fuzzy match on the name
+                # Find the best filter
+                to_filter = filtered["Address"].str.split(",").str[0:2].str.join("").apply(
+                    lambda x: fuzz.partial_ratio(home["Name"], x) > 9
+                )
+
+            if to_filter.sum() == 0:
+                blah
+            filtered = filtered[to_filter]
+
+        if filtered.empty:
+            continue
+
+        if filtered.shape[0] == 1:
+            ccs_matching_lookup.append(
+                {
+                    "survey_folder": filtered["survey_folder"].values[0],
+                    "Asset ID.1": home["Asset ID.1"],
+                    "Name": home["Name"]
+                }
+            )
+            continue
+
+        blah2
+
+        # home["Name"] should be contained in the survey_folder
+        # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
+        # # We have an edge case wher some properties have two outputs in Sharepoint
+        # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+        #     raise Exception("Fix me1")
+        #     # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+        #
+        # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+        #     raise Exception("Fix me2")
+        #     # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+        #
+        # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
+        #     filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
+        #
+        # if filtered.empty:
+        #     continue
+        # if filtered.shape[0] != 1:
+        #     raise Exception("something went wrong")
+        #
+        # matching_lookup.append(
+        #     {
+        #         "survey_folder": filtered["survey_folder"].values[0],
+        #         "Address ID": home["Address ID"],
+        #         "Name": home["Name"]
+        #     }
+        # )
 
 # if __name__ == "__main__":
 #     main()

From 7dd64781724df896badfd2170cba3ba5d2c283b9 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 20:43:56 +0000
Subject: [PATCH 06/72] Added more logic for matching

---
 etl/customers/stonewater/Wave 3 Preparation.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 904afd30..ab640496 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3273,7 +3273,7 @@ def revised_model():
     ccs_matching_lookup = []
     for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
         # Handle the case that has the wrong postcode in the asset data
-        if home["Name"] in manual_filters:
+        if home["Name"] in ccs_manual_filters:
             filtered = retrofit_assessment_data[
                 retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]]
                 ].copy()
@@ -3297,13 +3297,16 @@ def revised_model():
                     )
                 )
             if to_filter.sum() == 0:
-                to_filter = filtered["Address"].str.split(",").str[0:2].str.join("") == home["Name"]
-
+                to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("") == home[
+                    "Name"]
+            if to_filter.sum() == 0:
+                to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:1].str.join("") == home[
+                    "Name"]
             if to_filter.sum() == 0:
                 # Do a fuzzy match on the name
                 # Find the best filter
-                to_filter = filtered["Address"].str.split(",").str[0:2].str.join("").apply(
-                    lambda x: fuzz.partial_ratio(home["Name"], x) > 9
+                to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("").apply(
+                    lambda x: fuzz.partial_ratio(home["Name"], x) > 93
                 )
 
             if to_filter.sum() == 0:

From 0331d82f6ac687b55297e80f430a15fa148f5d67 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 20:55:36 +0000
Subject: [PATCH 07/72] added manual match

---
 .../stonewater/Wave 3 Preparation.py          | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ab640496..61344038 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3267,15 +3267,19 @@ def revised_model():
         columns={"Post Code": "Postcode"}
     )
     ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]
+    ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"]
     from fuzzywuzzy import fuzz
 
-    ccs_manual_filters = {}
+    ccs_manual_filters = {
+        "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35"
+    }
     ccs_matching_lookup = []
     for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
+
         # Handle the case that has the wrong postcode in the asset data
         if home["Name"] in ccs_manual_filters:
             filtered = retrofit_assessment_data[
-                retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]]
+                retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]]
                 ].copy()
         else:
             filtered = retrofit_assessment_data[
@@ -3297,11 +3301,15 @@ def revised_model():
                     )
                 )
             if to_filter.sum() == 0:
-                to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("") == home[
-                    "Name"]
+                to_filter = (
+                    filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("").str.lower() ==
+                    home["Name"].lower()
+                )
             if to_filter.sum() == 0:
-                to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:1].str.join("") == home[
-                    "Name"]
+                to_filter = (
+                    filtered["Address"].str.replace("  ,", "").str.split(",").str[0:1].str.join("").str.lower() ==
+                    home["Name"].lower()
+                )
             if to_filter.sum() == 0:
                 # Do a fuzzy match on the name
                 # Find the best filter

From 678a4b52d28194d1dcf7c2d86d3993dde0161f3f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 21:03:11 +0000
Subject: [PATCH 08/72] matching for all of ccs

---
 etl/customers/stonewater/Wave 3 Preparation.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 61344038..fa548f0d 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3316,6 +3316,19 @@ def revised_model():
                 to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("").apply(
                     lambda x: fuzz.partial_ratio(home["Name"], x) > 93
                 )
+            if to_filter.sum() == 0:
+                # We also have some cases where the name of the survey folder is like "Colville Road 7" and the
+                # property name is actually 7 Colville Road, so we try taking the final part of the address,
+                # splitting on space, and adding it to the front
+                def reformat_survey_folder(x):
+                    filename = x.split("/")[-1]
+                    parts = filename.split(" ")
+                    return " ".join(parts[-1:] + parts[:-1])
+
+                to_filter = (
+                    filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() ==
+                    home["Name"].lower()
+                )
 
             if to_filter.sum() == 0:
                 blah

From 7291f7128e6b5403132e5afdcc56330ea3d71f15 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 21:11:29 +0000
Subject: [PATCH 09/72] started wates matching

---
 .../stonewater/Wave 3 Preparation.py          | 119 +++++++++++++-----
 1 file changed, 91 insertions(+), 28 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index fa548f0d..cbbf04c6 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3331,7 +3331,7 @@ def revised_model():
                 )
 
             if to_filter.sum() == 0:
-                blah
+                raise Exception("Error")
             filtered = filtered[to_filter]
 
         if filtered.empty:
@@ -3347,34 +3347,97 @@ def revised_model():
             )
             continue
 
-        blah2
+        raise Exception("No match")
 
-        # home["Name"] should be contained in the survey_folder
-        # filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
-        # # We have an edge case wher some properties have two outputs in Sharepoint
-        # if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
-        #     raise Exception("Fix me1")
-        #     # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
-        #
-        # if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
-        #     raise Exception("Fix me2")
-        #     # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
-        #
-        # if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
-        #     filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
-        #
-        # if filtered.empty:
-        #     continue
-        # if filtered.shape[0] != 1:
-        #     raise Exception("something went wrong")
-        #
-        # matching_lookup.append(
-        #     {
-        #         "survey_folder": filtered["survey_folder"].values[0],
-        #         "Address ID": home["Address ID"],
-        #         "Name": home["Name"]
-        #     }
-        # )
+    ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup)
+    # We get a match for all records
+    assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0]
+    assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum()
+
+    # We do the same for Wates
+    wates_coordination = wates_coordination.rename(
+        columns={"Post Code": "Postcode"}
+    )
+    wates_coordination = wates_coordination[
+        wates_coordination["Retrofit Assessment"].isin(["Completed"])
+    ]
+
+    wates_manual_filters = {}
+    wates_matching_lookup = []
+    for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
+
+        # Handle the case that has the wrong postcode in the asset data
+        if home["Name"] in wates_manual_filters:
+            filtered = retrofit_assessment_data[
+                retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]]
+                ].copy()
+        else:
+            filtered = retrofit_assessment_data[
+                retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+                ].copy()
+
+            # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+            to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+                home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+            )
+            if to_filter.sum() == 0:
+                to_filter = (
+                    filtered["survey_folder"].
+                    str.replace(r"[^\w\s]", "").
+                    str.replace(",", "").
+                    str.replace(".", "").
+                    str.contains(
+                        home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+                    )
+                )
+            if to_filter.sum() == 0:
+                to_filter = (
+                    filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("").str.lower() ==
+                    home["Name"].lower()
+                )
+            if to_filter.sum() == 0:
+                to_filter = (
+                    filtered["Address"].str.replace("  ,", "").str.split(",").str[0:1].str.join("").str.lower() ==
+                    home["Name"].lower()
+                )
+            if to_filter.sum() == 0:
+                # Do a fuzzy match on the name
+                # Find the best filter
+                to_filter = filtered["Address"].str.replace("  ,", "").str.split(",").str[0:2].str.join("").apply(
+                    lambda x: fuzz.partial_ratio(home["Name"], x) > 93
+                )
+            if to_filter.sum() == 0:
+                # We also have some cases where the name of the survey folder is like "Colville Road 7" and the
+                # property name is actually 7 Colville Road, so we try taking the final part of the address,
+                # splitting on space, and adding it to the front
+                def reformat_survey_folder(x):
+                    filename = x.split("/")[-1]
+                    parts = filename.split(" ")
+                    return " ".join(parts[-1:] + parts[:-1])
+
+                to_filter = (
+                    filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() ==
+                    home["Name"].lower()
+                )
+
+            if to_filter.sum() == 0:
+                raise Exception("Error")
+            filtered = filtered[to_filter]
+
+        if filtered.empty:
+            continue
+
+        if filtered.shape[0] == 1:
+            wates_matching_lookup.append(
+                {
+                    "survey_folder": filtered["survey_folder"].values[0],
+                    "Asset ID": home["Asset ID"],
+                    "Name": home["Name"]
+                }
+            )
+            continue
+
+        raise Exception("No match")
 
 # if __name__ == "__main__":
 #     main()

From b1936521f6f3c3585057d5f2ce10d1998e558400 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 21:16:18 +0000
Subject: [PATCH 10/72] added manual match

---
 etl/customers/stonewater/Wave 3 Preparation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index cbbf04c6..8a00604b 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3362,7 +3362,9 @@ def revised_model():
         wates_coordination["Retrofit Assessment"].isin(["Completed"])
     ]
 
-    wates_manual_filters = {}
+    wates_manual_filters = {
+        "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View"
+    }
     wates_matching_lookup = []
     for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
 

From 1814d7b6709cd7861db5c15ac6821a601708882e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 29 Jan 2025 21:21:08 +0000
Subject: [PATCH 11/72] 11% through matching

---
 etl/customers/stonewater/Wave 3 Preparation.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 8a00604b..7cbf04f1 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3382,6 +3382,13 @@ def revised_model():
             to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
                 home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
             )
+
+            if to_filter.sum() > 1:
+                to_filter = (
+                    filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() ==
+                    home["Name"].replace(r"[^\w\s]", "").lstrip().lower()
+                )
+
             if to_filter.sum() == 0:
                 to_filter = (
                     filtered["survey_folder"].

From b4296db52d7b3c3e26ce3869ac31753bd731c379 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 30 Jan 2025 00:51:39 +0000
Subject: [PATCH 12/72] adding quidos extraction functions

---
 .../stonewater/Wave 3 Preparation.py          |  7 ++
 survey_report/app.py                          | 44 +++++++++
 .../extraction/detect_report_type.py          | 19 ++++
 survey_report/extraction/quidos.py            | 99 +++++++++++++++++++
 survey_report/requirements.txt                |  0
 5 files changed, 169 insertions(+)
 create mode 100644 survey_report/app.py
 create mode 100644 survey_report/extraction/detect_report_type.py
 create mode 100644 survey_report/extraction/quidos.py
 create mode 100644 survey_report/requirements.txt

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 7cbf04f1..70c531c0 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3366,8 +3366,15 @@ def revised_model():
         "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View"
     }
     wates_matching_lookup = []
+    # Examples to skip when we cannot get the data
+    wates_to_skip = [
+        "66 Abbatt Close",  # File type is unusual, couldn't extract the data
+    ]
     for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
 
+        if home["Name"] in wates_to_skip:
+            continue
+
         # Handle the case that has the wrong postcode in the asset data
         if home["Name"] in wates_manual_filters:
             filtered = retrofit_assessment_data[
diff --git a/survey_report/app.py b/survey_report/app.py
new file mode 100644
index 00000000..825a3658
--- /dev/null
+++ b/survey_report/app.py
@@ -0,0 +1,44 @@
+import os
+import PyPDF2
+from survey_report.extraction.detect_report_type import detect_report_type
+from survey_report.extraction.quidos import SiteNotesExtractor
+
+
+def handle():
+    """
+    Performs the data extraction process for the survey report
+    :return:
+    """
+
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2"
+
+    folder_contents = os.listdir(data_folder)
+    # We look for the following files:
+    # Site notes
+    file_mapping = {}
+    for file in folder_contents:
+        # Check if it's a pdf file
+        if not file.endswith(".pdf"):
+            continue
+        filepath = os.path.join(data_folder, file)
+        with (open(filepath, "rb") as f):
+            pdf = PyPDF2.PdfReader(f)
+            first_page = pdf.pages[0].extract_text()
+            text = ""
+            for page in pdf.pages:
+                text += page.extract_text()
+
+        # Check the report type
+        report_type = detect_report_type(first_page)
+        if report_type is not None:
+            file_mapping[report_type] = text
+
+        # Check the report type
+        report_type = detect_report_type(os.path.join(data_folder, file))
+
+    # This is only set up to work with quido site notes so we must have it
+    if "quidos_site_notes" not in file_mapping:
+        raise ValueError("No quidos site notes found")
+
+    site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
+    site_notes = site_notes_extractor.extract_all()
diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py
new file mode 100644
index 00000000..fe1600e7
--- /dev/null
+++ b/survey_report/extraction/detect_report_type.py
@@ -0,0 +1,19 @@
+import re
+
+
+def detect_report_type(first_page):
+    """
+    Detects the type of report based on the first page of the report
+    :param first_page:
+    :return:
+    """
+    # Set up for now to handle Quidos files only. We have the Elmhurst logic so we can introduce
+    # it when we need to
+
+    if re.match(
+        r"^Created \d{2}/\d{2}/\d{4} for Quidos Ltd using Argyle software BRE approved calculator",
+        first_page
+    ):
+        return "quidos_site_notes"
+
+    return None
diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py
new file mode 100644
index 00000000..f11ffcb1
--- /dev/null
+++ b/survey_report/extraction/quidos.py
@@ -0,0 +1,99 @@
+import re
+
+
+class SiteNotesExtractor:
+    """
+    Extracts SAP rating, carbon emissions, and building dimensions from an EPC summary report.
+    """
+
+    def __init__(self, pdf_text):
+        """
+        Initializes the SiteNotesExtractor with the extracted PDF text.
+        """
+        self.text = pdf_text
+        self.data = {}
+
+    def extract_sap_rating(self):
+        """
+        Extracts the current and potential SAP rating from the report.
+        """
+        pattern = re.search(r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)", self.text)
+
+        if not pattern:
+            raise ValueError("No SAP rating found in the report")
+
+        self.data.update({
+            "Current EPC Band": pattern.group(1),
+            "Current SAP Rating": int(pattern.group(2)),
+            "Potential EPC Band": pattern.group(3),
+            "Potential SAP Rating": int(pattern.group(4)),
+        })
+
+    def extract_carbon_emissions(self):
+        """
+        Extracts the current annual carbon emissions (TCO2).
+        """
+        pattern = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)
+
+        if not pattern:
+            raise ValueError("No carbon emissions found in the report")
+
+        self.data.update({
+            "Current Carbon Emissions (TCO2)": float(pattern.group(1)),
+        })
+
+    def extract_building_dimensions(self):
+        """
+        Extracts dimensions for each building part and stores them in a list.
+        Handles Main Property and multiple extensions.
+        """
+
+        # Locate the Dimensions section
+        dimensions_section = re.search(
+            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
+            r"Party Wall "
+            r"Length \(m\)\n"
+            r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL
+        )
+
+        if not dimensions_section:
+            raise ValueError("Failed to locate the dimensions section in the text.")
+
+        dimensions_text = dimensions_section.group(1)
+
+        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
+        building_part_pattern = re.compile(
+            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+        )
+
+        building_parts = []
+        for match in building_part_pattern.finditer(dimensions_text):
+            to_append = {
+                "Building Part": match.group(1).strip(),
+                "Part Floor Area (m2)": float(match.group(2)),
+                "Room Height (m)": float(match.group(3)),
+                "Loss Perimeter (m)": float(match.group(4)),
+                "Party Wall Length (m)": float(match.group(5)),
+            }
+            # We calculate the heat loss area
+            to_append["Heat Loss Area (m2)"] = to_append["Loss Perimeter (m)"] * to_append["Room Height (m)"]
+            building_parts.append(to_append)
+
+        if not building_parts:
+            raise ValueError("No building dimensions found in the report")
+
+        self.data["Building Dimensions"] = building_parts
+        # We calculate some totals
+        self.data["Total Building Dimensions"] = {
+            "floor_area": sum([part["Part Floor Area (m2)"] for part in building_parts]),
+            "heat_loss_area": sum([part["Heat Loss Area (m2)"] for part in building_parts]),
+        }
+
+    def extract_all(self):
+        """
+        Runs all extraction methods and returns a dictionary with extracted data.
+        """
+        self.extract_sap_rating()
+        self.extract_carbon_emissions()
+        self.extract_building_dimensions()
+        return self.data
diff --git a/survey_report/requirements.txt b/survey_report/requirements.txt
new file mode 100644
index 00000000..e69de29b

From 32b053e7db3b08445b1649d6c418f33c5b235647 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 30 Jan 2025 00:54:56 +0000
Subject: [PATCH 13/72] extracting bills

---
 survey_report/extraction/quidos.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py
index f11ffcb1..ae66dd0d 100644
--- a/survey_report/extraction/quidos.py
+++ b/survey_report/extraction/quidos.py
@@ -89,11 +89,23 @@ class SiteNotesExtractor:
             "heat_loss_area": sum([part["Heat Loss Area (m2)"] for part in building_parts]),
         }
 
+    def extract_bills_estimate(self):
+        """
+        Extracts the estimated annual energy costs (£) from the report.
+        """
+        pattern = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)
+
+        if not pattern:
+            raise ValueError("No bills estimate found in the report")
+
+        self.data["Estimated Annual Energy Cost (£)"] = float(pattern.group(1).replace(",", ""))
+
     def extract_all(self):
         """
         Runs all extraction methods and returns a dictionary with extracted data.
         """
         self.extract_sap_rating()
         self.extract_carbon_emissions()
+        self.extract_bills_estimate()
         self.extract_building_dimensions()
         return self.data

From daabf2a586eec7bf31440696f014ad7035a0033e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 30 Jan 2025 01:09:41 +0000
Subject: [PATCH 14/72] extracting epr

---
 survey_report/app.py                          | 20 ++++---
 .../extraction/detect_report_type.py          |  3 +
 survey_report/extraction/quidos.py            | 55 +++++++++++++++++++
 3 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/survey_report/app.py b/survey_report/app.py
index 825a3658..f59c9984 100644
--- a/survey_report/app.py
+++ b/survey_report/app.py
@@ -1,7 +1,7 @@
 import os
 import PyPDF2
 from survey_report.extraction.detect_report_type import detect_report_type
-from survey_report.extraction.quidos import SiteNotesExtractor
+from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
 
 
 def handle():
@@ -33,12 +33,18 @@ def handle():
         if report_type is not None:
             file_mapping[report_type] = text
 
-        # Check the report type
-        report_type = detect_report_type(os.path.join(data_folder, file))
-
     # This is only set up to work with quido site notes so we must have it
-    if "quidos_site_notes" not in file_mapping:
-        raise ValueError("No quidos site notes found")
-
     site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
     site_notes = site_notes_extractor.extract_all()
+
+    # We also must have an EPR
+    epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
+    epr = epr_extractor.extract_all()
+
+    # We now produce the combined data sheet which is the starting figure:
+    data_sheet = {**epr, **site_notes}
+    del data_sheet['Building Dimensions']
+    # We unnest the Total Building Dimensions
+    data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
+    data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
+    del data_sheet["Total Building Dimensions"]
diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py
index fe1600e7..434a3fb4 100644
--- a/survey_report/extraction/detect_report_type.py
+++ b/survey_report/extraction/detect_report_type.py
@@ -16,4 +16,7 @@ def detect_report_type(first_page):
     ):
         return "quidos_site_notes"
 
+    if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page):
+        return "quidos_epr"
+
     return None
diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py
index ae66dd0d..374df084 100644
--- a/survey_report/extraction/quidos.py
+++ b/survey_report/extraction/quidos.py
@@ -109,3 +109,58 @@ class SiteNotesExtractor:
         self.extract_bills_estimate()
         self.extract_building_dimensions()
         return self.data
+
+
+class EPRExtractor:
+    """
+    Extracts space heating, water heating, and address from an Energy Performance Report (EPR).
+    """
+
+    def __init__(self, pdf_text):
+        """
+        Initializes the EPRExtractor with the extracted PDF text.
+        """
+        self.text = pdf_text
+        self.data = {}
+
+    def extract_heating_data(self):
+        """
+        Extracts space heating and water heating values from the report.
+        """
+        pattern = re.search(
+            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
+            self.text,
+            re.DOTALL
+        )
+
+        if not pattern:
+            raise ValueError("No heating data found in the report")
+
+        self.data.update({
+            "Space Heating (KWH)": int(pattern.group(1).replace(",", "")),
+            "Water Heating (KWH)": int(pattern.group(2).replace(",", ""))
+        })
+
+    def extract_address(self):
+        """
+        Extracts the full address from the report.
+        """
+        pattern = re.search(
+            r"Address\s*(.*?)\nTown\s*(.*?)\n",
+            self.text,
+            re.DOTALL
+        )
+
+        if not pattern:
+            raise ValueError("No address found in the report")
+
+        full_address = pattern.group(1).strip()
+        self.data["Address"] = full_address
+
+    def extract_all(self):
+        """
+        Runs all extraction methods and returns a dictionary with extracted data.
+        """
+        self.extract_address()
+        self.extract_heating_data()
+        return self.data

From f6d8688698bfcdc1c9d1230b9040dfe071e2bf1e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 30 Jan 2025 17:30:17 +0000
Subject: [PATCH 15/72] completed matching

---
 .../stonewater/Wave 3 Preparation.py          |  89 +++++++++++--
 etl/customers/stonewater/data_cleaning.py     |   5 +-
 survey_report/app.py                          |  41 ++++++
 survey_report/template.html                   | 123 ++++++++++++++++++
 4 files changed, 248 insertions(+), 10 deletions(-)
 create mode 100644 survey_report/template.html

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 70c531c0..d9b5c41d 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3078,6 +3078,13 @@ def revised_model():
 
     retrofit_assessment_data = pd.DataFrame(extracted_data)
 
+    # retrofit_assessment_data.to_csv(
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False
+    # )
+    retrofit_assessment_data = pd.read_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"),
+    )
+
     # Remove some definite duplicates
     dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"]
     dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)]
@@ -3097,10 +3104,6 @@ def revised_model():
     # Replace \n with ""
     retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
 
-    # retrofit_assessment_data.to_csv(
-    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet.csv"), index=False
-    # )
-
     # We can read in the data as needed
 
     # Next Step: Read in the coordinated measures and match to the extracted data
@@ -3108,24 +3111,59 @@ def revised_model():
     # CCS
     #############################################################
     ccs_coordination_sheet = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"),
+        os.path.join(
+            CUSTOMER_FOLDER_PATH,
+            "Jan 2025 Project",
+            "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx"
+        ),
         header=4
     )
+    ccs_postcodes = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"
+        ),
+        header=4
+    )
+    ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 'Asset ID.1']].merge(
+        ccs_coordination_sheet, how="left", on="Name"
+    )
+    ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])]
     ccs_coordination_sheet["contractor"] = "CCS"
     # We split ccs into two sections - the first being
     ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21)
     ccs_coordination_sheet = ccs_coordination_sheet.head(87)
     ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet])
 
+    from urllib import parse
+    def extract_sharepoint_url(x):
+        if pd.isnull(x):
+            return ""
+        return "/".join(parse.urlparse(
+            x.split(" - http")[1]
+        ).path.replace("%20", " ").split("/")[-2:])
+
+    ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x))
+
     ############################################################
     # WATES
     #############################################################
     wates_coordination_sheet = pd.read_excel(
+        os.path.join(
+            CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx"
+        ),
+        header=4
+    )
+    wates_postcodes = pd.read_excel(
         os.path.join(
             CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx"
         ),
         header=4
     )
+    wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])]
+    wates_coordination_sheet = wates_coordination_sheet.merge(
+        wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name"
+    )
+
     wates_coordination_sheet["contractor"] = "Wates"
     # Break into the different sites:
     # Wiltshire
@@ -3136,7 +3174,7 @@ def revised_model():
     wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :]
     wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :]
     wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :]
-    wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[928:972, :]
+    wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :]
 
     wates_coordination = pd.concat(
         [
@@ -3151,12 +3189,15 @@ def revised_model():
         ]
     )
 
+    wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply(
+        lambda x: extract_sharepoint_url(x)
+    )
+
     # Combine the data back
 
     ############################################################
     # NEW 450 COORDINATED RETROFIT ASSESSMENTS
     #############################################################
-
     retrofit_packages_board = pd.read_excel(
         os.path.join(
             CUSTOMER_FOLDER_PATH,
@@ -3361,17 +3402,49 @@ def revised_model():
     wates_coordination = wates_coordination[
         wates_coordination["Retrofit Assessment"].isin(["Completed"])
     ]
+    wates_coordination = wates_coordination[
+        ~pd.isnull(wates_coordination["Postcode"])
+    ]
 
     wates_manual_filters = {
-        "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View"
+        "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View",
+        "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft",
+        "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31  Rabley Wood View",
+        'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13',
+        "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4",
+        '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1',
+        '2 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 2',
     }
     wates_matching_lookup = []
     # Examples to skip when we cannot get the data
     wates_to_skip = [
         "66 Abbatt Close",  # File type is unusual, couldn't extract the data
+        "Flat 69 Goddard Road",  # Doesn't exist
+        "19 Garth House",  # # File type is unusual, couldn't extract the data
+        '5 Gilpin Close',  # No properly formatted EPR
+        '49 The Hide, Netherfield',  # TODO: TEMP HERE
+        '19 Chanders Rd',
+        '5 Chanders Rd',
+        '23 Chanders Rd',
+        '3 Chanders Rd',
+        '1 Orchard Close',
     ]
     for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
 
+        # Search the folder
+        filtered = retrofit_assessment_data[
+            retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False)
+        ]
+        if len(filtered) == 1:
+            wates_matching_lookup.append(
+                {
+                    "survey_folder": filtered["survey_folder"].values[0],
+                    "Asset ID": home["Asset ID"],
+                    "Name": home["Name"]
+                }
+            )
+            continue
+
         if home["Name"] in wates_to_skip:
             continue
 
diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py
index 7ee06fcd..010902ce 100644
--- a/etl/customers/stonewater/data_cleaning.py
+++ b/etl/customers/stonewater/data_cleaning.py
@@ -86,7 +86,6 @@ def download_data_from_sharepoint():
         folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders"
     )
 
-    len(contents["value"])
     folders_to_pull = [
         folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. Coventry"]
     ]
@@ -108,6 +107,8 @@ def download_data_from_sharepoint():
                 folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" +
                             folder_to_pull["name"] + "/" + property_folder["name"]
             )
+            if not property_folder_contents.get("value"):
+                continue
             # We look for the retrofit assessment folder:
             property_sub_folders = [
                 f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower()
@@ -138,5 +139,5 @@ def download_data_from_sharepoint():
                 drive_id=sharepoint_client.document_drive["id"],
                 folder_path=property_folder_path,
                 download_dir=download_dir,
-                excluded_file_types=["MOV"]
+                excluded_file_types=["MOV", "jpg"]
             )
diff --git a/survey_report/app.py b/survey_report/app.py
index f59c9984..87ce7864 100644
--- a/survey_report/app.py
+++ b/survey_report/app.py
@@ -1,9 +1,33 @@
 import os
 import PyPDF2
+from string import Template
 from survey_report.extraction.detect_report_type import detect_report_type
 from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
 
 
+def generate_html_report(template_path, output_path, data):
+    """
+    Reads an HTML template file, fills its ``${name}`` placeholders, and writes the final HTML report.
+
+    Args:
+    - template_path (str): Path to the HTML template file (read as UTF-8).
+    - output_path (str): Path to save the generated HTML file; missing parent folders are created.
+    - data (dict): Placeholder name -> value. Values are inserted as given, without HTML escaping.
+    """
+    with open(template_path, "r", encoding="utf-8") as f:
+        html_template = Template(f.read())
+    # safe_substitute leaves placeholders that are missing from `data` untouched instead of raising KeyError
+    final_html = html_template.safe_substitute(data)
+
+    # Create the destination folder first so writing the report does not fail with FileNotFoundError
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(final_html)
+    print(f"HTML report generated successfully: {output_path}")
+
+
 def handle():
     """
     Performs the data extraction process for the survey report
@@ -48,3 +72,20 @@ def handle():
     data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
     data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
     del data_sheet["Total Building Dimensions"]
+
+    # Generate the HTML report
+    # Placeholder locations
+    template_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/template.html"
+    output_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/output/report.html"
+    logo_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/assets/logo.png"
+    generate_html_report(
+        template_path, output_path,
+        data={
+            "address": data_sheet["Address"],
+            "logo_path": logo_path,
+            "current_epc": data_sheet["Current EPC Band"],
+            "current_sap": data_sheet["Current SAP Rating"],
+            "potential_epc": "A",  # TODO PLACEHOLDER
+            "potential_sap": 91,  # TODO PLACEHOLDER
+        }
+    )
diff --git a/survey_report/template.html b/survey_report/template.html
new file mode 100644
index 00000000..5d3b6c63
--- /dev/null
+++ b/survey_report/template.html
@@ -0,0 +1,123 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Domna Energy Report</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            background-color: #ffffff;
+            color: #333;
+            margin: 0;
+            padding: 0;
+            display: flex;
+            justify-content: center;
+        }
+        .container {
+            width: 100%;
+            max-width: 1300px;
+            margin: 20px auto;
+        }
+        .header {
+            background-color: #1B1F3B;
+            color: white;
+            padding: 30px;
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            border-radius: 12px;
+        }
+        .header h1 {
+            margin: 5px;
+            font-size: 24px;
+        }
+        .header p {
+            margin: 5px 0 0;
+            font-size: 16px;
+            color: #d1d5db;
+        }
+        .logo img {
+            height: 60px;
+        }
+
+        /* EPC Rating Cards */
+        .epc-container {
+            display: flex;
+            justify-content: space-between;
+            gap: 20px;
+            margin-top: 30px;
+        }
+        .epc-card {
+            background-color: white;
+            border: 2px solid #ccc;
+            border-radius: 10px;
+            padding: 20px;
+            flex: 1;
+            display: flex;
+            flex-direction: column;
+            justify-content: space-between; /* Pushes SAP to bottom */
+            align-items: center;
+            text-align: center;
+            box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.1);
+            position: relative;
+            height: 160px;
+        }
+        .epc-title {
+            font-size: 18px;
+            font-weight: bold;
+            color: #666;
+        }
+        .epc-rating {
+            font-size: 50px;
+            font-weight: bold;
+        }
+        .sap-rating {
+            font-size: 18px;
+            color: #555;
+            position: absolute;
+            bottom: 10px;
+            right: 20px;
+        }
+        .before .epc-rating {
+            color: #1B1F3B; /* Dark navy, same as the header background */
+        }
+        .after .epc-rating {
+            color: #D4AF37; /* Gold */
+        }
+
+    </style>
+</head>
+<body>
+
+    <div class="container">
+        <!-- Header Section -->
+        <div class="header">
+            <div>
+                <h1>Domna Energy Report</h1>
+                <p>${address}</p> <!-- Address Placeholder -->
+            </div>
+            <div class="logo">
+                <img src="${logo_path}" alt="Domna Logo">
+            </div>
+        </div>
+
+        <!-- EPC Rating Cards -->
+        <div class="epc-container">
+            <div class="epc-card before">
+                <div class="epc-title">Current EPC Rating</div>
+                <div class="epc-rating">${current_epc}</div>
+                <div class="sap-rating">SAP ${current_sap}</div>
+            </div>
+
+            <div class="epc-card after">
+                <div class="epc-title">Potential EPC Rating</div>
+                <div class="epc-rating">${potential_epc}</div>
+                <div class="sap-rating">SAP ${potential_sap}</div>
+            </div>
+        </div>
+
+    </div>
+
+</body>
+</html>

From 01a5077c17cd219ddc907c48eaae4158c9117cfb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 3 Feb 2025 12:54:57 +0000
Subject: [PATCH 16/72] tidying up stonewater work

---
 .../stonewater/Wave 3 Preparation.py          | 224 +++++++++++++++++-
 1 file changed, 221 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index d9b5c41d..5c4da35b 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1,4 +1,6 @@
 import os
+from pyexpat import features
+
 import PyPDF2
 import re
 import pandas as pd
@@ -1704,7 +1706,6 @@ def append_stonewater_id():
     )
     model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])]
     model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int)
-    z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values()
 
     original_archetypes = pd.read_excel(
         "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
@@ -2942,7 +2943,6 @@ def revised_model():
     """
 
     # 1) Create the new list of properties
-
     new_priority_postcodes = pd.read_excel(
         "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 "
         "priority list.xlsx"
@@ -3188,7 +3188,13 @@ def revised_model():
             wates_coordination_sheet_abeyance
         ]
     )
-
+    # We correct the Asset ID for 34 Kempster Close
+    wates_coordination["Asset ID"] = np.where(
+        wates_coordination["Name"] == "34 Kempster Close",
+        "12005",
+        wates_coordination["Asset ID"]
+    )
+    
     wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply(
         lambda x: extract_sharepoint_url(x)
     )
@@ -3198,6 +3204,14 @@ def revised_model():
     ############################################################
     # NEW 450 COORDINATED RETROFIT ASSESSMENTS
     #############################################################
+    features = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+    features["Address ID"] = features["Address ID"].astype(str).astype(int)
+    features_to_merge = features[["Address ID", "Organisation Reference"]]
+
     retrofit_packages_board = pd.read_excel(
         os.path.join(
             CUSTOMER_FOLDER_PATH,
@@ -3211,6 +3225,10 @@ def revised_model():
         retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
     ]
 
+    retrofit_packages_board = retrofit_packages_board.merge(
+        features_to_merge, how="left", on="Address ID"
+    )
+
     manual_filters = {
         "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
         "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
@@ -3527,6 +3545,206 @@ def revised_model():
             continue
 
         raise Exception("No match")
+    wates_matching_lookup = pd.DataFrame(wates_matching_lookup)
+
+    # Merge lookup tables onto the coordination sheets
+    wates_coordination = wates_coordination.merge(
+        wates_matching_lookup, how="left", on="Name"
+    )
+    missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])]
+    if not missed_asset_id.empty:
+        # We fill the missing ids
+        missing_lookup = {
+            "4  Sydnall Fields": 31231,
+            "12  Sydnall Fields": 31239,
+            "12  Athena Gardens": 28061,
+            "49  Banner Lane": 41189,
+            "4  Jonathan Road": 41232,
+            "8  Jonathan Road": 41236,
+            "1  Jonathan Road": 41229,
+            "96  Taunton Way": 31417,
+            "94  Taunton Way": 31418,
+            "1  Lady Lane": 29430,
+            "10  Jonathan Road": 41283,
+            "21  Jonathan Road": 41246,
+            "12  Ashcroft Close": 26399
+        }
+        for name, asset_id in missing_lookup.items():
+            wates_coordination["Asset ID_x"] = np.where(
+                wates_coordination["Name"] == name,
+                asset_id,
+                wates_coordination["Asset ID_x"]
+            )
+
+    ccs_coordination = ccs_coordination.merge(
+        ccs_matching_lookup, how="left", on="Name"
+    )
+
+    retrofit_packages_board = retrofit_packages_board.merge(
+        matching_lookup, how="left", on="Name"
+    )
+
+    # We combine this into a singular board
+    coordinated_packages = pd.concat(
+        [
+            retrofit_packages_board[
+                [
+                    "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating',
+                    'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref',
+                    'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+                    'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+                    'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+                    'Solar PV', 'Other measures', 'Organisation Reference',
+                ]
+            ],
+            ccs_coordination[
+                [
+                    # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls,
+                    # Solar PV
+                    "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+                    'SAP Band Install Package', 'Package Approved (Client)',
+                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+                    'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
+                ]
+            ].rename(
+                columns={
+                    "SAP Band Pre": "Actual SAP Band",
+                    "SAP Rating Pre": "Actual SAP Rating",
+                    'SAP Rating Install Package': 'Modelled SAP Band',
+                    'SAP Band Install Package': 'Modelled SAP Rating',
+                    'Package Approved (Client)': 'Package Ref',
+                    'Wall Insulation': 'Main Wall Insulation',
+                    'Loft Insulation': 'Loft insulation',
+                    'Windows Upgrade': 'Window Upgrade',
+                    'Ext. Doors Upgrade': 'Door Upgrade',
+                    'Heating': 'Main Heating',
+                    'Other Measures': 'Other measures',
+                    'Asset ID.1_y': 'Organisation Reference',
+                }
+            ),
+            wates_coordination[
+                [
+                    "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+                    'SAP Band Install Package', 'Package Approved (Client)',
+                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'
+
+                ]
+            ].rename(
+                columns={
+                    "SAP Band Pre": "Actual SAP Band",
+                    "SAP Rating Pre": "Actual SAP Rating",
+                    'SAP Rating Install Package': 'Modelled SAP Band',
+                    'SAP Band Install Package': 'Modelled SAP Rating',
+                    'Package Approved (Client)': 'Package Ref',
+                    'Wall Insulation': 'Main Wall Insulation',
+                    'Loft Insulation': 'Loft insulation',
+                    'Windows Upgrade': 'Window Upgrade',
+                    'Ext. Doors Upgrade': 'Door Upgrade',
+                    'Heating': 'Main Heating',
+                    'Other Measures': 'Other measures',
+                    'Asset ID_x': 'Organisation Reference',
+                }
+            )
+        ]
+    )
+
+    coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int)
+
+    # Merge the property features on
+    coordinated_packages = coordinated_packages.merge(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+        how="left",
+        on="Organisation Reference"
+    )
+
+    # We need the features pertaining to these priority postcodes
+
+    def find_nearest_matching_property(coordinated_packages, home):
+        """Return the rows of coordinated_packages equal to `home` on every column of the first level that matches."""
+        # Levels are tried in the order listed; the first one that leaves at least one row wins
+        filter_levels = [
+            ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"],
+            ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
+            ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"],
+        ]
+        for filters in filter_levels:
+            # Boolean indexing returns a new frame each time, so no up-front copy of the full table is needed
+            match = coordinated_packages
+            for col in filters:
+                match = match[match[col] == home[col]]
+            if not match.empty:
+                return match
+
+        return None  # No match found
+
+    coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
+    new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip()
+
+    coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip()
+    new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip()
+
+    coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0]
+    new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0]
+
+    # For every property in the priority postcodes data, we look for a most appropriate matching property
+    no_match = []
+    matches = []
+    for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)):
+        closest_match = find_nearest_matching_property(coordinated_packages, home)
+        if closest_match is None:
+            no_match.append(home["Organisation Reference"])
+            continue
+
+        to_extend = [
+            {
+                "Organisation Reference": home["Organisation Reference"],
+                "Best Match Organisation Reference": m
+            } for m in closest_match["Organisation Reference"].values
+        ]
+        matches.extend(to_extend)
+
+    no_match_summary = new_priority_postcodes[
+        new_priority_postcodes["Organisation Reference"].isin(
+            no_match
+        )
+    ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[
+        "Organisation Reference"].count().reset_index()
+
+    no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
+
+    # len(no_match)
+    # 8764, 5607
+    # no_match_summary.shape
+    # (3953, 6), (2948, 6)
+
+    # We match the properties to their closest match
+
+    matches_df = pd.DataFrame(matches)
+    matches_df = matches_df.merge(
+        coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]],
+        left_on="Best Match Organisation Reference", right_on="Organisation Reference",
+        suffixes=("", " - Closest Match")
+    )
+    # We want to aggregate the matches, when we have multiple
+    aggregated_matches_df = []
+    for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+        if mapped_matches.shape[0] == 1:
+            mapped_matches["Number of matches"] = 1
+            mapped_matches["Proportion"]
+            aggregated_matches_df.append(mapped_matches)
+            continue
+
+    mapped_priority_list = new_priority_postcodes.merge(
+        matches_df, on="Organisation Reference",
+    )
+    # We merge on the EPC ratings for the matched properties
+    mapped_priority_list = mapped_priority_list.merge(
+
+    )
 
 # if __name__ == "__main__":
 #     main()

From 04eba60961b0ea215701b2b35feaed74f9a5ef11 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 3 Feb 2025 13:04:10 +0000
Subject: [PATCH 17/72] fixing cleaning for stonewater

---
 .../stonewater/Wave 3 Preparation.py          | 58 +++++++++++--------
 1 file changed, 35 insertions(+), 23 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 5c4da35b..04078e47 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3194,7 +3194,32 @@ def revised_model():
         "12005",
         wates_coordination["Asset ID"]
     )
-    
+
+    # We fill the missing ids
+    missing_lookup = {
+        "4  Sydnall Fields": 31231,
+        "12  Sydnall Fields": 31239,
+        "12  Athena Gardens": 28061,
+        "49  Banner Lane": 41189,
+        "4  Jonathan Road": 41232,
+        "8  Jonathan Road": 41236,
+        "1  Jonathan Road": 41229,
+        "96  Taunton Way": 31417,
+        "94  Taunton Way": 31418,
+        "1  Lady Lane": 29430,
+        "10  Jonathan Road": 41283,
+        "21  Jonathan Road": 41246,
+        "12  Ashcroft Close": 26399
+    }
+    for name, asset_id in missing_lookup.items():
+        wates_coordination["Asset ID_x"] = np.where(
+            wates_coordination["Name"] == name,
+            asset_id,
+            wates_coordination["Asset ID_x"]
+        )
+
+    wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])]
+
     wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply(
         lambda x: extract_sharepoint_url(x)
     )
@@ -3412,6 +3437,7 @@ def revised_model():
     # We get a match for all records
     assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0]
     assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum()
+    assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum()
 
     # We do the same for Wates
     wates_coordination = wates_coordination.rename(
@@ -3447,6 +3473,8 @@ def revised_model():
         '3 Chanders Rd',
         '1 Orchard Close',
     ]
+    wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)]
+
     for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
 
         # Search the folder
@@ -3547,34 +3575,18 @@ def revised_model():
         raise Exception("No match")
     wates_matching_lookup = pd.DataFrame(wates_matching_lookup)
 
+    # We get a match for all records
+    assert wates_matching_lookup.shape[0] == wates_coordination.shape[0]
+    assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum()
+    assert not wates_matching_lookup["Asset ID"].duplicated().sum()
+
     # Merge lookup tables onto the coordination sheets
     wates_coordination = wates_coordination.merge(
         wates_matching_lookup, how="left", on="Name"
     )
     missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])]
     if not missed_asset_id.empty:
-        # We fill the missing ids
-        missing_lookup = {
-            "4  Sydnall Fields": 31231,
-            "12  Sydnall Fields": 31239,
-            "12  Athena Gardens": 28061,
-            "49  Banner Lane": 41189,
-            "4  Jonathan Road": 41232,
-            "8  Jonathan Road": 41236,
-            "1  Jonathan Road": 41229,
-            "96  Taunton Way": 31417,
-            "94  Taunton Way": 31418,
-            "1  Lady Lane": 29430,
-            "10  Jonathan Road": 41283,
-            "21  Jonathan Road": 41246,
-            "12  Ashcroft Close": 26399
-        }
-        for name, asset_id in missing_lookup.items():
-            wates_coordination["Asset ID_x"] = np.where(
-                wates_coordination["Name"] == name,
-                asset_id,
-                wates_coordination["Asset ID_x"]
-            )
+        raise Exception("Missing Asset ID")
 
     ccs_coordination = ccs_coordination.merge(
         ccs_matching_lookup, how="left", on="Name"

From 10bc433283417a2c15ffe2924537ded81af240d6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 3 Feb 2025 16:06:47 +0000
Subject: [PATCH 18/72] assigning properties to bands

---
 .../stonewater/Wave 3 Preparation.py          | 71 ++++++++++++++++---
 1 file changed, 62 insertions(+), 9 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 04078e47..c623e9f7 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3212,10 +3212,10 @@ def revised_model():
         "12  Ashcroft Close": 26399
     }
     for name, asset_id in missing_lookup.items():
-        wates_coordination["Asset ID_x"] = np.where(
+        wates_coordination["Asset ID"] = np.where(
             wates_coordination["Name"] == name,
             asset_id,
-            wates_coordination["Asset ID_x"]
+            wates_coordination["Asset ID"]
         )
 
     wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])]
@@ -3596,6 +3596,16 @@ def revised_model():
         matching_lookup, how="left", on="Name"
     )
 
+    # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board
+    to_remove = wates_coordination[
+        wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+    ]
+    assert to_remove.shape[0] == 4
+    # Remove them from the wates board
+    wates_coordination = wates_coordination[
+        ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+    ]
+
     # We combine this into a singular board
     coordinated_packages = pd.concat(
         [
@@ -3662,6 +3672,7 @@ def revised_model():
     )
 
     coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int)
+    assert not coordinated_packages["Organisation Reference"].duplicated().sum()
 
     # Merge the property features on
     coordinated_packages = coordinated_packages.merge(
@@ -3670,6 +3681,25 @@ def revised_model():
         on="Organisation Reference"
     )
 
+    # We match the properties to their closest match
+    # We clean up the SAP ratings in the coordinated packages
+    def sap_to_number(x):
+        """Convert a SAP score such as 72, "72", "72C" or "C72" to an int; returns None if no band letter is found."""
+        try:
+            return int(x)
+        except (TypeError, ValueError):
+            if x[-1] in "ABCDEFG":  # trailing band letter, e.g. "72C" (EPC bands run A-G)
+                return int(x[:-1])
+            if x[0] in "ABCDEFG":  # leading band letter, e.g. "C72"
+                return int(x[1:])
+
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])]
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])]
+
+    coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply(
+        lambda x: sap_to_number(x)
+    )
+
     # We need the features pertaining to these priority postcodes
 
     def find_nearest_matching_property(coordinated_packages, home):
@@ -3729,11 +3759,9 @@ def revised_model():
     no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
 
     # len(no_match)
-    # 8764, 5607
+    # 8764, 5607, 5646
     # no_match_summary.shape
-    # (3953, 6), (2948, 6)
-
-    # We match the properties to their closest match
+    # (3953, 6), (2948, 6), (2969, 7)
 
     matches_df = pd.DataFrame(matches)
     matches_df = matches_df.merge(
@@ -3745,11 +3773,36 @@ def revised_model():
     aggregated_matches_df = []
     for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
         if mapped_matches.shape[0] == 1:
-            mapped_matches["Number of matches"] = 1
-            mapped_matches["Proportion"]
-            aggregated_matches_df.append(mapped_matches)
+            aggregated_matches_df.append(
+                {
+                    "Organisation Reference": org_ref,
+                    "Number of matches": 1,
+                    "Proportion": 100,
+                    "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0],
+                    "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0])
+                }
+            )
             continue
 
+        # We need to aggregate the matches, since we have multiple
+        average_rating = mapped_matches["Actual SAP Rating"].mean()
+        number_of_matches = mapped_matches.shape[0]
+        average_epc_rating = sap_to_epc(average_rating)
+        # proportion is the number of properties that have this EPC rating
+        proportion_with_this_epc = int(
+            mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100)
+        aggregated_matches_df.append(
+            {
+                "Organisation Reference": org_ref,
+                "Number of matches": number_of_matches,
+                "Proportion": proportion_with_this_epc,
+                "Estimated SAP Rating": average_rating,
+                "Estimated EPC Rating": average_epc_rating
+            }
+        )
+
+    aggregated_matches_df = pd.DataFrame(aggregated_matches_df)
+
     mapped_priority_list = new_priority_postcodes.merge(
         matches_df, on="Organisation Reference",
     )

From 139db23592ea885af14d8734d9cf2e36a1484a59 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 4 Feb 2025 14:04:20 +0000
Subject: [PATCH 19/72] putting together outputs

---
 .../stonewater/Wave 3 Preparation.py          | 346 +++++++++++++++---
 etl/route_march_data_pull/app.py              |  16 +-
 survey_report/app.py                          |  79 ++--
 3 files changed, 360 insertions(+), 81 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index c623e9f7..1748f624 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1,5 +1,6 @@
 import os
-from pyexpat import features
+from urllib import parse
+from fuzzywuzzy import fuzz
 
 import PyPDF2
 import re
@@ -2936,6 +2937,14 @@ def identify_incorrect_packages():
     )
 
 
+def extract_sharepoint_url(x):
+    if pd.isnull(x):
+        return ""
+    return "/".join(parse.urlparse(
+        x.split(" - http")[1]
+    ).path.replace("%20", " ").split("/")[-2:])
+
+
 def revised_model():
     """
     This function implements the revised model for Stonewater, where we are looking at new priority postcodes
@@ -2956,6 +2965,7 @@ def revised_model():
     original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
     original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
     original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+    original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)
 
     # Check if we have all of the addresses
     missed = original_archetypes[
@@ -2965,7 +2975,7 @@ def revised_model():
     assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'}
 
     original_archetypes = original_archetypes[
-        ["Address ID", "Archetype ID", "Archetype Group Rank"]
+        ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"]
     ]
 
     # Merge these archetypes on to the new priority postcodes
@@ -3104,6 +3114,42 @@ def revised_model():
     # Replace \n with ""
     retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
 
+    retrofit_assessments_data_columns = [
+        'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)',
+        'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys',
+        'Fuel Bill', 'Window Age Description',
+        'Window Age Description Proportion (%)',
+        'Secondary Window Age Description',
+        'Secondary Window Age Description Proportion (%)', 'Number of Windows',
+        'Total Number of Doors', 'Number of Insulated Doors',
+        'Existing Primary Heating System',
+        'Existing Primary Heating PCDF Reference',
+        'Existing Primary Heating Controls',
+        'Existing Primary Heating % of Heat',
+        'Existing Secondary Heating System',
+        'Existing Secondary Heating PCDF Reference',
+        'Existing Secondary Heating Controls',
+        'Existing Secondary Heating % of Heat', 'Secondary Heating Code',
+        'Water Heating Code', 'Total Floor Area (m2)',
+        'Total Ground Floor Area (m2)', 'RIR Floor Area',
+        'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)',
+        'Number of Light Fittings', 'Number of LEL Fittings',
+        'Number of fittings needing LEL', 'Main Roof Type',
+        'Main Roof Insulation', 'Main Roof Insulation Thickness',
+        'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining',
+        'Main Wall Thickness', 'Main Building Alternative Wall Type',
+        'Main Building Alternative Wall Insulation',
+        'Main Building Alternative Wall Dry-lining',
+        'Main Building Alternative Wall Thickness', 'Main Fuel'
+    ]
+    # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
+    retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
+    rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed))
+    retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict)
+    retrofit_assessment_data["Survey: Current EPC Band"] = (
+        retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x))
+    )
+
     # We can read in the data as needed
 
     # Next Step: Read in the coordinated measures and match to the extracted data
@@ -3134,14 +3180,6 @@ def revised_model():
     ccs_coordination_sheet = ccs_coordination_sheet.head(87)
     ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet])
 
-    from urllib import parse
-    def extract_sharepoint_url(x):
-        if pd.isnull(x):
-            return ""
-        return "/".join(parse.urlparse(
-            x.split(" - http")[1]
-        ).path.replace("%20", " ").split("/")[-2:])
-
     ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x))
 
     ############################################################
@@ -3224,8 +3262,6 @@ def revised_model():
         lambda x: extract_sharepoint_url(x)
     )
 
-    # Combine the data back
-
     ############################################################
     # NEW 450 COORDINATED RETROFIT ASSESSMENTS
     #############################################################
@@ -3352,7 +3388,6 @@ def revised_model():
     )
     ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]
     ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"]
-    from fuzzywuzzy import fuzz
 
     ccs_manual_filters = {
         "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35"
@@ -3596,6 +3631,17 @@ def revised_model():
         matching_lookup, how="left", on="Name"
     )
 
+    # We now map the retrofit assessment data to the coordinated packages
+    wates_coordination = wates_coordination.merge(
+        retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+    )
+    ccs_coordination = ccs_coordination.merge(
+        retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+    )
+    retrofit_packages_board = retrofit_packages_board.merge(
+        retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+    )
+
     # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board
     to_remove = wates_coordination[
         wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
@@ -3617,8 +3663,8 @@ def revised_model():
                     'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
                     'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
                     'Solar PV', 'Other measures', 'Organisation Reference',
-                ]
-            ],
+                ] + retrofit_assessments_data_columns_prefixed
+                ],
             ccs_coordination[
                 [
                     # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls,
@@ -3627,8 +3673,8 @@ def revised_model():
                     'SAP Band Install Package', 'Package Approved (Client)',
                     'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
                     'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
-                ]
-            ].rename(
+                ] + retrofit_assessments_data_columns_prefixed
+                ].rename(
                 columns={
                     "SAP Band Pre": "Actual SAP Band",
                     "SAP Rating Pre": "Actual SAP Rating",
@@ -3651,8 +3697,8 @@ def revised_model():
                     'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
                     'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'
 
-                ]
-            ].rename(
+                ] + retrofit_assessments_data_columns_prefixed
+                ].rename(
                 columns={
                     "SAP Band Pre": "Actual SAP Band",
                     "SAP Rating Pre": "Actual SAP Rating",
@@ -3681,24 +3727,8 @@ def revised_model():
         on="Organisation Reference"
     )
 
-    # We match the properties to their closest match
-    # We clean up the SAP ratings in the coordinated packages
-    def sap_to_number(x):
-        try:
-            return int(x)
-        except:
-            if x[-1] in ["A", "B", "C", "D", "E", "F"]:
-                return int(x[:-1])
-
-            if x[0] in ["A", "B", "C", "D", "E", "F"]:
-                return int(x[1:])
-
-    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])]
-    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])]
-
-    coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply(
-        lambda x: sap_to_number(x)
-    )
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])]
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])]
 
     # We need the features pertaining to these priority postcodes
 
@@ -3721,6 +3751,11 @@ def revised_model():
             if not match.empty:
                 return match
 
+        # Finally, we search for a property in the same Archetype
+        match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]]
+        if not match.empty:
+            return match
+
         return None  # No match found
 
     coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
@@ -3732,6 +3767,12 @@ def revised_model():
     coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0]
     new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0]
 
+    coordinated_packages = coordinated_packages.merge(
+        new_priority_postcodes[["Organisation Reference", "Archetype ID"]],
+        how="left",
+        on="Organisation Reference"
+    )
+
     # For every property in the priority postcodes data, we look for a most appropriate matching property
     no_match = []
     matches = []
@@ -3759,16 +3800,17 @@ def revised_model():
     no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
 
     # len(no_match)
-    # 8764, 5607, 5646
+    # 8764, 5607, 5646, 5071
     # no_match_summary.shape
-    # (3953, 6), (2948, 6), (2969, 7)
+    # (3953, 6), (2948, 6), (2969, 7), (2575, 7)
 
     matches_df = pd.DataFrame(matches)
     matches_df = matches_df.merge(
-        coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]],
+        coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]],
         left_on="Best Match Organisation Reference", right_on="Organisation Reference",
         suffixes=("", " - Closest Match")
     )
+
     # We want to aggregate the matches, when we have multiple
     aggregated_matches_df = []
     for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
@@ -3778,19 +3820,21 @@ def revised_model():
                     "Organisation Reference": org_ref,
                     "Number of matches": 1,
                     "Proportion": 100,
-                    "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0],
-                    "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0])
+                    "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
+                    "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0]
                 }
             )
             continue
 
         # We need to aggregate the matches, since we have multiple
-        average_rating = mapped_matches["Actual SAP Rating"].mean()
+        average_rating = mapped_matches["Survey: Current SAP Rating"].mean()
         number_of_matches = mapped_matches.shape[0]
         average_epc_rating = sap_to_epc(average_rating)
         # proportion is the number of properties that have this EPC rating
         proportion_with_this_epc = int(
-            mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100)
+            mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
+                0] / number_of_matches * 100
+        )
         aggregated_matches_df.append(
             {
                 "Organisation Reference": org_ref,
@@ -3804,12 +3848,220 @@ def revised_model():
     aggregated_matches_df = pd.DataFrame(aggregated_matches_df)
 
     mapped_priority_list = new_priority_postcodes.merge(
-        matches_df, on="Organisation Reference",
+        aggregated_matches_df, on="Organisation Reference", how="left"
     )
-    # We merge on the EPC ratings for the matched properties
-    mapped_priority_list = mapped_priority_list.merge(
 
+    mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0]
+
+    # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0
+
+    def remove_leading_zero(address):
+        return re.sub(r"^0([1-9]) ", r"\1 ", address)
+
+    # Strip the leading zero from the house number at the start of each first address line
+    mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
+    mapped_priority_list["address1"] = np.where(
+        mapped_priority_list["Organisation Reference"] == 37004,
+        "8 Mason Road",
+        mapped_priority_list["address1"]
     )
+    mapped_priority_list["address1"] = np.where(
+        mapped_priority_list["Organisation Reference"] == 37003,
+        "9 Mason Road",
+        mapped_priority_list["address1"]
+    )
+
+    mapped_priority_list = mapped_priority_list.rename(
+        columns={"UPRN": "uprn"}
+    )
+    mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]
+
+    # Let's get the newest EPC data for these properties
+    # We merge on UPRN, when we have it
+    # from etl.route_march_data_pull.app import get_data
+    # epc_data, errors, nodata = get_data(
+    #     asset_list=mapped_priority_list,
+    #     fulladdress_column="Address",
+    #     address1_column="address1",
+    #     postcode_column="Postcode",
+    #     manual_uprn_map={},
+    #     epc_api_only=True
+    # )
+    #
+    # epc_df = pd.DataFrame(epc_data)
+    # epc_df.to_csv(
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False
+    # )
+    epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"))
+    epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"})
+
+    # We now package up the data
+
+    # Sheet 1 is the base coordination data
+    output_coordination_sheet = coordinated_packages[
+        [
+            "Name", "Postcode", 'Organisation Reference', 'Package Ref',
+            'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+            'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+            'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+            'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band',
+            'Survey: Primary Energy Use (kWh/yr)',
+            'Survey: Primary Energy Use Intensity (kWh/m2/yr)',
+            'Survey: Number of Storeys', 'Survey: Fuel Bill',
+            'Survey: Window Age Description',
+            'Survey: Window Age Description Proportion (%)',
+            'Survey: Secondary Window Age Description',
+            'Survey: Secondary Window Age Description Proportion (%)',
+            'Survey: Number of Windows', 'Survey: Total Number of Doors',
+            'Survey: Number of Insulated Doors',
+            'Survey: Existing Primary Heating System',
+            'Survey: Existing Primary Heating PCDF Reference',
+            'Survey: Existing Primary Heating Controls',
+            'Survey: Existing Primary Heating % of Heat',
+            'Survey: Existing Secondary Heating System',
+            'Survey: Existing Secondary Heating PCDF Reference',
+            'Survey: Existing Secondary Heating Controls',
+            'Survey: Existing Secondary Heating % of Heat',
+            'Survey: Secondary Heating Code', 'Survey: Water Heating Code',
+            'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)',
+            'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)',
+            'Survey: First Extension Wall Area (m2)',
+            'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings',
+            'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type',
+            'Survey: Main Roof Insulation',
+            'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type',
+            'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining',
+            'Survey: Main Wall Thickness',
+            'Survey: Main Building Alternative Wall Type',
+            'Survey: Main Building Alternative Wall Insulation',
+            'Survey: Main Building Alternative Wall Dry-lining',
+            'Survey: Main Building Alternative Wall Thickness',
+            'Survey: Main Fuel',
+            'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
+        ]
+    ].rename(
+        columns={
+            'Walls': "Parity - Walls",
+            'Roofs': "Parity - Roof",
+            'Heating': "Parity - Heating",
+            'Main Fuel': "Parity - Fuel",
+            'Age': "Parity - Age Band",
+            'Property Type': "Parity - Property Type"
+        }
+    )
+
+    # Sheet 2 is the lookup table which maps the properties to their closest match
+    # We need to bring in the parity attributes between the mapped properties so we can see side-by-side
+    mapped_lookup = matches_df[
+        [
+            'Organisation Reference',
+            'Best Match Organisation Reference',
+            'Survey: Current EPC Band',
+            'Survey: Current SAP Rating'
+        ]
+    ].rename(
+        columns={
+            'Best Match Organisation Reference': "Best Match - Organisation Reference",
+            "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band",
+            'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating"
+        }
+    ).merge(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+        how="left",
+        on="Organisation Reference"
+    ).merge(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename(
+            columns={
+                "Organisation Reference": "Best Match - Organisation Reference",
+                "Walls": "Best Match - Walls",
+                "Roofs": "Best Match - Roof",
+                "Heating": "Best Match - Heating",
+                "Main Fuel": "Best Match - Main Fuel",
+                "Age": "Best Match - Age",
+                "Property Type": "Best Match - Property Type"
+            }
+        ),
+        how="left",
+        on="Best Match - Organisation Reference"
+    ).merge(
+        coordinated_packages[
+            [
+                "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
+                'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
+                'Survey: Existing Primary Heating System',
+            ]
+        ].rename(
+            columns={
+                "Organisation Reference": "Best Match - Organisation Reference",
+                'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type',
+                'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation',
+                'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type',
+                'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation',
+                'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness',
+                'Survey: Existing Primary Heating System': 'Best Match - Survey: Existing Primary Heating System',
+            }
+        ),
+        how="left",
+        on="Best Match - Organisation Reference"
+    )
+
+    # Finally, we have the property, against the mapped home with the estimated SAP scores and the EPC data
+    worksheet = mapped_priority_list[
+        [
+            'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
+            'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
+            'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating'
+        ]
+    ].rename(
+        columns={
+            "SAP": "Parity - SAP Rating",
+            "SAP Band": "Parity - EPC Rating",
+            "Property Type": "Parity - Property Type",
+            "Walls": "Parity - Walls",
+            "Roofs": "Parity - Roofs",
+            'Glazing': "Parity - Glazing",
+            'Heating': 'Parity - Heating',
+            'Main Fuel': 'Parity - Main Fuel',
+            'Hot Water': 'Parity - Hot Water',
+        }
+    ).merge(
+        epc_df[
+            [
+                "Organisation Reference",
+                "uprn",
+                "current-energy-efficiency",
+                "current-energy-rating",
+                "lodgement-date",
+                "construction-age-band",
+                "walls-description",
+                "roof-description",
+                "mainheat-description",
+                "windows-description",
+                "hotwater-description",
+                "main-fuel",
+                "total-floor-area",
+            ]
+        ].rename(
+            columns={
+                "uprn": "Last EPC - uprn",
+                "current-energy-efficiency": "Last EPC - SAP Score",
+                "current-energy-rating": "Last EPC - EPC Rating",
+                "lodgement-date": "Last EPC - Date Lodged",
+                "construction-age-band": "Last EPC - Age Band",
+                "walls-description": "Last EPC - Walls",
+                "roof-description": "Last EPC - Roof",
+                "mainheat-description": "Last EPC - Heating",
+                "windows-description": "Last EPC - Windows",
+                "hotwater-description": "Last EPC - Hot Water",
+                "main-fuel": "Last EPC - Main Fuel",
+                "total-floor-area": "Last EPC - Total Floor Area"
+            }
+        ),
+        how="left",
+        on='Organisation Reference'
+    )
+
+    worksheet["Years Since Last EPC"]
 
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 247ce98c..3432b744 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -20,7 +20,7 @@ load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
-def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map):
+def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True):
     epc_data = []
     errors = []
     no_epc = []
@@ -33,6 +33,11 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m
             if house_no is None:
                 house_no = house_number
             uprn = manual_uprn_map.get(full_address, None)
+            if uprn is None and home.get("uprn"):
+                uprn = home["uprn"]
+
+            if pd.isnull(uprn):
+                uprn = None
 
             searcher = SearchEpc(
                 address1=str(house_no),
@@ -88,6 +93,15 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m
                 no_epc.append(home["row_id"])
                 continue
 
+            if epc_api_only:
+                epc = {
+                    "row_id": home["row_id"],
+                    **searcher.newest_epc.copy()
+                }
+
+                epc_data.append(epc)
+                continue
+
             # Look for EPC recommendatons
             try:
                 property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
diff --git a/survey_report/app.py b/survey_report/app.py
index 87ce7864..be31bd52 100644
--- a/survey_report/app.py
+++ b/survey_report/app.py
@@ -1,6 +1,9 @@
 import os
 import PyPDF2
 from string import Template
+
+import pandas as pd
+
 from survey_report.extraction.detect_report_type import detect_report_type
 from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
 
@@ -34,44 +37,54 @@ def handle():
     :return:
     """
 
-    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2"
+    folders = [
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5",
+    ]
+    data = []
+    for data_folder in folders:
 
-    folder_contents = os.listdir(data_folder)
-    # We look for the following files:
-    # Site notes
-    file_mapping = {}
-    for file in folder_contents:
-        # Check if it's a pdf file
-        if not file.endswith(".pdf"):
-            continue
-        filepath = os.path.join(data_folder, file)
-        with (open(filepath, "rb") as f):
-            pdf = PyPDF2.PdfReader(f)
-            first_page = pdf.pages[0].extract_text()
-            text = ""
-            for page in pdf.pages:
-                text += page.extract_text()
+        folder_contents = os.listdir(data_folder)
+        # We look for the following files:
+        # Site notes
+        file_mapping = {}
+        for file in folder_contents:
+            # Check if it's a pdf file
+            if not file.endswith(".pdf"):
+                continue
+            filepath = os.path.join(data_folder, file)
+            with (open(filepath, "rb") as f):
+                pdf = PyPDF2.PdfReader(f)
+                first_page = pdf.pages[0].extract_text()
+                text = ""
+                for page in pdf.pages:
+                    text += page.extract_text()
 
-        # Check the report type
-        report_type = detect_report_type(first_page)
-        if report_type is not None:
-            file_mapping[report_type] = text
+            # Check the report type
+            report_type = detect_report_type(first_page)
+            if report_type is not None:
+                file_mapping[report_type] = text
 
-    # This is only set up to work with quido site notes so we must have it
-    site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
-    site_notes = site_notes_extractor.extract_all()
+        # This is only set up to work with Quidos site notes, so we must have them
+        site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
+        site_notes = site_notes_extractor.extract_all()
 
-    # We also must have an EPR
-    epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
-    epr = epr_extractor.extract_all()
+        # We also must have an EPR
+        epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
+        epr = epr_extractor.extract_all()
 
-    # We now produce the combined data sheet which is the starting figure:
-    data_sheet = {**epr, **site_notes}
-    del data_sheet['Building Dimensions']
-    # We unnest the Total Building Dimensions
-    data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
-    data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
-    del data_sheet["Total Building Dimensions"]
+        # We now produce the combined data sheet which is the starting figure:
+        data_sheet = {**epr, **site_notes}
+        del data_sheet['Building Dimensions']
+        # We unnest the Total Building Dimensions
+        data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
+        data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
+        del data_sheet["Total Building Dimensions"]
+        data.append(data_sheet)
+    data = pd.DataFrame(data)
 
     # Generate the HTML report
     # Placeholder locations

From 7885467fa40240a2a2632b4b6e120cce5a047c61 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 4 Feb 2025 14:35:24 +0000
Subject: [PATCH 20/72] formatting output

---
 .../stonewater/Wave 3 Preparation.py          | 42 +++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 1748f624..fcde164e 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3777,6 +3777,21 @@ def revised_model():
     no_match = []
     matches = []
     for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)):
+
+        # We check if the property was surveyed
+        survey_result = coordinated_packages[
+            coordinated_packages["Organisation Reference"] == home["Organisation Reference"]
+            ]
+        if not survey_result.empty:
+            to_extend = [
+                {
+                    "Organisation Reference": home["Organisation Reference"],
+                    "Best Match Organisation Reference": m,
+                    "Was Surveyed": True
+                } for m in survey_result["Organisation Reference"].values
+            ]
+            matches.extend(to_extend)
+
         closest_match = find_nearest_matching_property(coordinated_packages, home)
         if closest_match is None:
             no_match.append(home["Organisation Reference"])
@@ -3785,7 +3800,8 @@ def revised_model():
         to_extend = [
             {
                 "Organisation Reference": home["Organisation Reference"],
-                "Best Match Organisation Reference": m
+                "Best Match Organisation Reference": m,
+                "Was Surveyed": False
             } for m in closest_match["Organisation Reference"].values
         ]
         matches.extend(to_extend)
@@ -4010,7 +4026,8 @@ def revised_model():
         [
             'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
             'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
-            'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating'
+            'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion',
+            'Estimated SAP Rating', 'Estimated EPC Rating'
         ]
     ].rename(
         columns={
@@ -4023,6 +4040,7 @@ def revised_model():
             'Heating': 'Parity - Heating',
             'Main Fuel': 'Parity - Main Fuel',
             'Hot Water': 'Parity - Hot Water',
+            'Proportion': 'Proportion of matched properties with same EPC rating',
         }
     ).merge(
         epc_df[
@@ -4061,7 +4079,25 @@ def revised_model():
         on='Organisation Reference'
     )
 
-    worksheet["Years Since Last EPC"]
+    worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime(
+        worksheet["Last EPC - Date Lodged"]).dt.year
+
+    worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str)
+
+    worksheet["uprn"] = np.where(
+        pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]),
+        worksheet["Last EPC - uprn"],
+        worksheet["uprn"]
+    )
+
+    worksheet["uprn"] = worksheet["uprn"].replace("<NA>", "")
+
+    # Save to Excel with multiple sheets
+    excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx")
+    with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
+        worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True)
+        mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True)
+        output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True)
 
 # if __name__ == "__main__":
 #     main()

From 77844c625eb1b00f140c1f64224b4101a51e1ca5 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 10 Feb 2025 15:41:33 +0000
Subject: [PATCH 21/72] minor

---
 etl/customers/panacap/assets.py               |  61 +++++
 etl/customers/remote_assessments/app.py       |  34 +--
 .../stonewater/Wave 3 Preparation.py          |  16 +-
 .../stonewater/potential_eco_properties.py    | 250 ++++++++----------
 etl/find_my_epc/RetrieveFindMyEpc.py          |  19 +-
 etl/route_march_data_pull/app.py              | 149 ++++++++---
 recommendations/Recommendations.py            |   2 +
 7 files changed, 324 insertions(+), 207 deletions(-)
 create mode 100644 etl/customers/panacap/assets.py

diff --git a/etl/customers/panacap/assets.py b/etl/customers/panacap/assets.py
new file mode 100644
index 00000000..ec57d9a4
--- /dev/null
+++ b/etl/customers/panacap/assets.py
@@ -0,0 +1,61 @@
+import os
+
+import pandas as pd
+from dotenv import load_dotenv
+
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from etl.route_march_data_pull.app import get_data
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+addresses = [
+    {"address": "3 Willis Road", "postcode": "CB1 2AQ"},
+    {"address": "22 Catharine Street", "postcode": "CB1 3AW"},
+    {"address": "332 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "330 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "328 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "71 Mill Road", "postcode": "CB1 2AS"},
+    {"address": "78 Argyle Street", "postcode": "CB1 3LZ"},
+    {"address": "9 Graham Road", "postcode": "CB4 2ZE"},
+    {"address": "217 Mill Road", "postcode": "CB1 3BE"},
+    {"address": "374 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "174 Thoday Street", "postcode": "CB1 3AX"},
+    {"address": "37 Abbey Road", "postcode": "CB5 8HH"},
+    {"address": "18 Upper Gwydir Street", "postcode": "CB1 2LR"},
+    {"address": "21 Fulbourn Road Fulbourn", "postcode": "CB1 9JL"},
+    {"address": "108 Argyle Street", "postcode": "CB1 3LS"},
+    {"address": "115 Victoria Road", "postcode": "CB4 3BS"},
+    {"address": "55 Ross Street", "postcode": "CB1 3BP"},
+    {"address": "16 Kingston Street", "postcode": "CB1 2NU"},
+    {"address": "13 Thoday Street", "postcode": "CB1 3AS"},
+    {"address": "103 York Street", "postcode": "CB1 2PZ"},
+]
+
+asset_list = pd.DataFrame(addresses)
+asset_list["row_id"] = asset_list.index
+
+epc_data, _, _ = get_data(
+    asset_list=asset_list, fulladdress_column="address", postcode_column="postcode", address1_column="address",
+    manual_uprn_map={}, epc_api_only=True
+)
+
+epc_df = pd.DataFrame(epc_data)
+epc_df.shape
+
+asset_list = asset_list.merge(
+    epc_df, how="left", on="row_id"
+)
+
+asset_list = asset_list.rename(columns={"address_x": "Address", "postcode_x": "Postcode"})
+asset_list["uprn"] = asset_list["uprn"].astype(str)
+
+spatial_data = OpenUprnClient.get_spatial_data([x["uprn"] for x in epc_data], bucket_name="retrofit-data-dev")
+spatial_data["UPRN"] = spatial_data["UPRN"].astype(str)
+
+asset_list = asset_list.merge(
+    spatial_data, how="left", left_on="uprn", right_on="UPRN"
+)
+
+asset_list.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv",
+                  index=False)
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index 13cdc41b..e1298565 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 126
+PORTFOLIO_ID = 127
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,22 +19,9 @@ def app():
 
     asset_list = [
         {
-            "address": "Garden Flat, 48 Bedminster Parade",
-            "postcode": "BS3 4HS",
-            "building_id": 1,
-            "uprn": 308249,
-        },
-        {
-            "address": "Top Floor Flat, 48 Bedminster Parade",
-            "postcode": "BS3 4HS",
-            "building_id": 1,
-            "uprn": 308251
-        },
-        {
-            "address": "First Floor Flat, 48 Bedminster Parade",
-            "postcode": "BS3 4HS",
-            "building_id": 1,
-            "uprn": 308250,
+            "address": "49 Brailsford Road",
+            "postcode": "M14 6PT",
+            "uprn": 77145666,
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -65,18 +52,7 @@ def app():
 
     valuation_data = [
         {
-            "address": "Garden Flat, 48 Bedminster Parade",
-            "postcode": "BS3 4HS",
-            "valuation": 337_000
-        },
-        {
-            "addresss": "Top Floor Flat, 48 Bedminster Parade",
-            "postcode": "BS3 4HS",
-            "valuation": 337_000
-        },
-        {
-            "address": "First Floor Flat, 48 Bedminster Parade",
-            "postcode": "BS3 4HS",
+            "uprn": 77145666,
             "valuation": 337_000
         }
     ]
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index fcde164e..b2a92e4c 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3777,7 +3777,6 @@ def revised_model():
     no_match = []
     matches = []
     for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)):
-
         # We check if the property was surveyed
         survey_result = coordinated_packages[
             coordinated_packages["Organisation Reference"] == home["Organisation Reference"]
@@ -3791,6 +3790,7 @@ def revised_model():
                 } for m in survey_result["Organisation Reference"].values
             ]
             matches.extend(to_extend)
+            continue
 
         closest_match = find_nearest_matching_property(coordinated_packages, home)
         if closest_match is None:
@@ -3821,6 +3821,7 @@ def revised_model():
     # (3953, 6), (2948, 6), (2969, 7), (2575, 7)
 
     matches_df = pd.DataFrame(matches)
+
     matches_df = matches_df.merge(
         coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]],
         left_on="Best Match Organisation Reference", right_on="Organisation Reference",
@@ -3837,7 +3838,8 @@ def revised_model():
                     "Number of matches": 1,
                     "Proportion": 100,
                     "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
-                    "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0]
+                    "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0],
+                    "Was Surveyed": mapped_matches["Was Surveyed"].values[0],
                 }
             )
             continue
@@ -3857,7 +3859,8 @@ def revised_model():
                 "Number of matches": number_of_matches,
                 "Proportion": proportion_with_this_epc,
                 "Estimated SAP Rating": average_rating,
-                "Estimated EPC Rating": average_epc_rating
+                "Estimated EPC Rating": average_epc_rating,
+                "Was Surveyed": False
             }
         )
 
@@ -3973,7 +3976,8 @@ def revised_model():
             'Organisation Reference',
             'Best Match Organisation Reference',
             'Survey: Current EPC Band',
-            'Survey: Current SAP Rating'
+            'Survey: Current SAP Rating',
+            "Was Surveyed"
         ]
     ].rename(
         columns={
@@ -4027,7 +4031,7 @@ def revised_model():
             'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
             'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
             'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion',
-            'Estimated SAP Rating', 'Estimated EPC Rating'
+            'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed"
         ]
     ].rename(
         columns={
@@ -4092,6 +4096,8 @@ def revised_model():
 
     worksheet["uprn"] = worksheet["uprn"].replace("<NA>", "")
 
+    worksheet = worksheet.drop(columns=["Last EPC - uprn"])
+
     # Save to Excel with multiple sheets
     excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx")
     with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py
index bda9c30c..eef82eae 100644
--- a/etl/customers/stonewater/potential_eco_properties.py
+++ b/etl/customers/stonewater/potential_eco_properties.py
@@ -217,78 +217,7 @@ def app():
     )
     )
 
-    # We get the EPC data
-    # epc_data = json.loads(
-    #     read_from_s3(
-    #         bucket_name="retrofit-data-dev",
-    #         s3_file_name="customers/Stonewater/clustering/epc_data.json"
-    #     )
-    # )
-    # epc_data = pd.DataFrame(epc_data)
-    #
-    # epc_data["uprn"] = np.where(
-    #     epc_data["internal_id"] == 1091,
-    #     83143766,
-    #     epc_data["uprn"]
-    # )
-    #
-    # epc_data_batch_2 = read_pickle_from_s3(
-    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
-    #     bucket_name="retrofit-data-dev"
-    # )
-    # epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
-    #
-    # complete_epcs = pd.concat([epc_data, epc_data_batch_2])
-    #
-    # epcs_to_merge = complete_epcs[
-    #     [
-    #         "uprn",
-    #         "address",
-    #         "postcode",
-    #         "property-type",
-    #         "built-form",
-    #         "inspection-date",
-    #         "current-energy-rating",
-    #         "current-energy-efficiency",
-    #         "roof-description",
-    #         "walls-description",
-    #         "transaction-type",
-    #         "secondheat-description",
-    #         "total-floor-area",
-    #         "construction-age-band",
-    #         "floor-height",
-    #         "number-habitable-rooms",
-    #         "mainheat-description",
-    #         "energy-consumption-current"
-    #     ]
-    # ].rename(
-    #     columns={
-    #         "address": "Address",
-    #         "postcode": "Postcode",
-    #         "inspection-date": "Date of last EPC",
-    #         "current-energy-efficiency": "SAP score on register",
-    #         "current-energy-rating": "EPC rating on register",
-    #         "property-type": "Property Type",
-    #         "built-form": "Archetype",
-    #         "total-floor-area": "Property Floor Area",
-    #         "construction-age-band": "Property Age Band",
-    #         "floor-height": "Property Floor Height",
-    #         "number-habitable-rooms": "Number of Habitable Rooms",
-    #         "walls-description": "Wall Construction",
-    #         "roof-description": "Roof Construction",
-    #         "mainheat-description": "Heating Type",
-    #         "secondheat-description": "Secondary Heating",
-    #         "transaction-type": "Reason for last EPC",
-    #         "energy-consumption-current": "Heat Demand (kWh/m2)",
-    #     }
-    # )
-    # # We de-dupe, taking the newest on the date the EPC was lod
-    # epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"])
-    # epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False)
-    # epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn")
-
     stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
-    stonewater_cavity_properties["Reason Included"].value_counts()
     # Find the postcodes where an Osmosis survey revealed a need for CWI
     postcodes_found_needing_cwi = stonewater_cavity_properties[
         stonewater_cavity_properties["Reason Included"].isin(
@@ -339,12 +268,7 @@ def app():
             "Renewables": "Parity - Renewables",
             "Total Floor Area": "Parity - Total Floor Area"
         }
-    )  # .merge(
-    #     epcs_to_merge,
-    #     how="left",
-    #     left_on="UPRN",
-    #     right_on="uprn"
-    # )
+    )
 
     # We now flag the additional properties in the as built list
 
@@ -434,12 +358,11 @@ def app():
 
     additional_properties["Suspected Needs CWI - not surveyed"] = (
         (
-            additional_properties["Postcode"].isin(postcodes_found_needing_cwi)
+            additional_properties["Postcode"].isin(postcodes_found_needing_cwi) &
+            ~additional_properties["Installed under ECO3"]
         )
     )
 
-    additional_properties["Same Postcode as Installed under ECO3"].value_counts()
-
     # We drop Full Address
     additional_properties = additional_properties.drop(columns=["Full Address"])
     additional_properties2 = additional_properties[[
@@ -461,65 +384,57 @@ def app():
             "Renewables": "Parity - Renewables",
             "Total Floor Area": "Parity - Total Floor Area"
         }
-    )  # .merge(
-    #     pd.DataFrame(additional_properties_epcs)[
-    #         [
-    #             "row_id",
-    #             "property-type",
-    #             "built-form",
-    #             "inspection-date",
-    #             "current-energy-rating",
-    #             "current-energy-efficiency",
-    #             "roof-description",
-    #             "walls-description",
-    #             "transaction-type",
-    #             "secondheat-description",
-    #             "total-floor-area",
-    #             "construction-age-band",
-    #             "floor-height",
-    #             "number-habitable-rooms",
-    #             "mainheat-description",
-    #             "energy-consumption-current"
-    #         ]
-    #     ].rename(
-    #         columns={
-    #             "inspection-date": "Date of last EPC",
-    #             "current-energy-efficiency": "SAP score on register",
-    #             "current-energy-rating": "EPC rating on register",
-    #             "property-type": "Property Type",
-    #             "built-form": "Archetype",
-    #             "total-floor-area": "Property Floor Area",
-    #             "construction-age-band": "Property Age Band",
-    #             "floor-height": "Property Floor Height",
-    #             "number-habitable-rooms": "Number of Habitable Rooms",
-    #             "walls-description": "Wall Construction",
-    #             "roof-description": "Roof Construction",
-    #             "mainheat-description": "Heating Type",
-    #             "secondheat-description": "Secondary Heating",
-    #             "transaction-type": "Reason for last EPC",
-    #             "energy-consumption-current": "Heat Demand (kWh/m2)",
-    #         }
-    #     ),
-    #     how="left",
-    #     on="row_id"
-    # )
+    )
+
+    # Combine the data:
+    full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2])
+
+    # We now define the priority list for non-intrusives
+    full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
+    full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0]
+
+    # Strip out anything we definitely don't want
+    full_dataset = full_dataset[~full_dataset["Installed under ECO3"]]
+
+    areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique()
+
+    priorities = full_dataset[
+        full_dataset["Postal Region 2"].isin(areas)
+    ]
+
+    region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index()
+    region_prevalance = region_prevalance[region_prevalance["count"] > 100]
+    df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)]
+
+    df["Postal Region"].value_counts()
+    df["Postal Region 2"].value_counts()
+
+    if df["Installed under ECO3"].sum():
+        raise ValueError("There are properties in the priority list that were installed under ECO3")
+
+    df.to_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
+        "revised list.xlsx",
+        index=False
+    )
 
     # We save the data locally
-    stonewater_cavity_properties.to_csv(
-        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
-        "postcodes.csv",
-        index=False
-    )
-    additional_properties2.to_csv(
-        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
-        "non-priority postcodes.csv",
-        index=False
-    )
-    # Save the survey findings
-    needs_cwi.to_csv(
-        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
-        index=False
-    )
+    # stonewater_cavity_properties.to_csv(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
+    #     "postcodes.csv",
+    #     index=False
+    # )
+    # additional_properties2.to_csv(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
+    #     "non-priority postcodes.csv",
+    #     index=False
+    # )
+    # # Save the survey findings
+    # needs_cwi.to_csv(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI -
+    #     WIP.csv",
+    #     index=False
+    # )
 
 
 def cross_reference_epc_programme():
@@ -528,6 +443,12 @@ def cross_reference_epc_programme():
         "SURVEYED - ECO3 NOT COMPLETED.xlsx"
     )
 
+    for _, x in eco3_fallout.iterrows():
+        house_no = SearchEpc.get_house_number(x["ADDRESS"], "")
+        if house_no is None:
+            house_no = x["ADDRESS"].split(",")[0]
+        x["house_number"] = house_no
+
     eco3_fallout["house_number"] = eco3_fallout.apply(
         lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1
     )
@@ -558,3 +479,58 @@ def cross_reference_epc_programme():
             stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90)
         ]
         match.head()
+
+
+def finalise_list_for_non_intrusives():
+    non_intrusives_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater "
+        "Non-Intrusives.xlsx"
+    )
+
+    # Remove anything installed under ECO3
+    non_intrusives_list = non_intrusives_list[~non_intrusives_list["Installed under ECO3"]]
+
+    # We mark any properties that were surveyed by Osmosis
+    packages = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 "
+        "(1).xlsx",
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+
+    non_intrusives_list["Surveyed by Osmosis"] = non_intrusives_list["Address ID"].isin(
+        packages["Address ID"].values
+    )
+    # Removed 54 addresses
+    final_non_intrusives = non_intrusives_list[
+        ~non_intrusives_list["Surveyed by Osmosis"]
+    ]
+
+    features = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+
+    # Add on the organisation reference
+    final_non_intrusives = final_non_intrusives.merge(
+        features[["Organisation Reference", "Address ID"]],
+        how="left",
+        on="Address ID"
+    )
+
+    final_non_intrusives["Postal Region"] = final_non_intrusives["Postcode"].str.split(" ").str[0].str[0:2]
+    selected_regions = final_non_intrusives[
+        final_non_intrusives["Include in non-intrusives"]
+    ]["Postcode"].unique()
+
+    final_non_intrusives["Is in region"] = final_non_intrusives["Postcode"].isin(selected_regions)
+
+    # Filter down:
+    final_non_intrusives = final_non_intrusives[
+        final_non_intrusives["Is in region"]
+    ]
+
+    final_non_intrusives.to_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives "
+        "List - final.xlsx")
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index f93a5a73..eaba1058 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -25,6 +25,7 @@ class RetrieveFindMyEpc:
         self.postcode = postcode
 
         self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()
+        self.walls = []
 
     @staticmethod
     def extract_low_carbon_sources(soup):
@@ -102,6 +103,8 @@ class RetrieveFindMyEpc:
         # 2) Bills estimates
         # 3) Recommendations and SAP points
         # 4) Low and zero carbon energy sources
+        # 5) The wall types of the property - used for determining if we have an extension cavity wall
+        #    insulation recommendation
 
         ratings = address_res.find('desc', {'id': 'svg-desc'}).text
         current_rating = ratings.split(".")[0]
@@ -208,6 +211,17 @@ class RetrieveFindMyEpc:
             if key not in assessment_data:
                 raise ValueError(f"Missing key: {key}")
 
+        # The wall types of the property
+        property_features_table = address_res.find("tbody", class_="govuk-table__body")
+        property_features_table = property_features_table.find_all("tr")
+
+        # Extract wall types
+        self.walls = []
+        for row in property_features_table:
+            cells = row.find_all("td")
+            if row.find("th").text.strip() == "Wall":
+                self.walls.append(cells[0].text.strip())
+
         # Finally, we format the recommendations
         recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)
 
@@ -229,8 +243,7 @@ class RetrieveFindMyEpc:
 
         return resulting_data
 
-    @staticmethod
-    def format_recommendations(recommendations, assessment_data, sap_2012_date=None):
+    def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):
         """
         This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
         :param recommendations: The recommendations from the EPC
@@ -330,6 +343,8 @@ class RetrieveFindMyEpc:
         for rec in recommendations:
             mapped = measure_map[rec["measure"]]
             for measure in mapped:
+                if measure == "cavity_wall_insulation" and self.walls and "solid brick" in self.walls[0].lower():
+                    measure = "extension_cavity_wall_insulation"
                 to_append = {
                     "type": measure,
                     "sap_points": rec["sap_points"],
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 3432b744..cc50caae 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -1,5 +1,6 @@
 import os
 import time
+import pickle
 
 import pandas as pd
 import numpy as np
@@ -20,7 +21,7 @@ load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
-def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True):
+def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=False):
     epc_data = []
     errors = []
     no_epc = []
@@ -116,10 +117,14 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m
                 find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
             except ValueError as e:
                 if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
-                    find_epc_searcher = RetrieveFindMyEpc(
-                        address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
-                    )
-                    find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+                    try:
+                        find_epc_searcher = RetrieveFindMyEpc(
+                            address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
+                        )
+                        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+                    except ValueError as e:
+                        if "No EPC found" in str(e):
+                            find_epc_data = {}
                 else:
                     find_epc_data = {}
             except Exception as e:
@@ -176,19 +181,33 @@ def app():
     Property UPRN
 
     """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing"
-    DATA_FILENAME = "For Housing Data pull.xlsx"
-    SHEET_NAME = "Sheet1"
-    POSTCODE_COLUMN = "Post Code"
-    FULLADDRESS_COLUMN = None
-    ADDRESS1_COLUMN = "NO."
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People"
+    DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx"
+    SHEET_NAME = "Assets 1"
+    POSTCODE_COLUMN = "Postcode"
+    FULLADDRESS_COLUMN = "Address"
+    ADDRESS1_COLUMN = "AddressLine1"
     ADDRESS1_METHOD = None
-    ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"]
+    ADDRESS_COLS_TO_CONCAT = []
+    MISSING_POSTCODES_METHOD = None
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
 
     asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
+
+    if MISSING_POSTCODES_METHOD is not None:
+        if MISSING_POSTCODES_METHOD == "last_two_words":
+            # Replace any double spaces
+            asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)
+            asset_list["Postcode"] = np.where(
+                pd.isnull(asset_list["Postcode"]),
+                asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "),
+                asset_list["Postcode"]
+            )
+        else:
+            raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized")
+
     asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
     asset_list["row_id"] = asset_list.index
 
@@ -217,29 +236,46 @@ def app():
         asset_list = asset_list[~asset_list["deduper"].duplicated()]
     asset_list = asset_list.drop(columns=["deduper"])
 
-    epc_data, errors, no_epc = get_data(
-        asset_list=asset_list,
-        fulladdress_column=FULLADDRESS_COLUMN,
-        address1_column=ADDRESS1_COLUMN,
-        postcode_column=POSTCODE_COLUMN,
-        manual_uprn_map=MANUAL_UPRN_MAP
-    )
+    # We chunk up this data into 5000 rows at a time
+    chunk_size = 5000
+    epc_data = []
+    errors = []
+    no_epc = []
+    skip = None  # Used to skip already completed chunks
+    for i in range(0, len(asset_list), chunk_size):
+        print(f"Processing chunk {i} to {i + chunk_size}")
+        if skip is not None:
+            if i <= skip:
+                continue
+        chunk = asset_list[i:i + chunk_size]
+        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
+            asset_list=chunk,
+            fulladdress_column=FULLADDRESS_COLUMN,
+            address1_column=ADDRESS1_COLUMN,
+            postcode_column=POSTCODE_COLUMN,
+            manual_uprn_map=MANUAL_UPRN_MAP
+        )
 
-    # We now retrieve any failed properties
-    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
-    epc_data_failed, _, _ = get_data(
-        asset_list=asset_list_failed,
-        fulladdress_column=FULLADDRESS_COLUMN,
-        address1_column=ADDRESS1_COLUMN,
-        postcode_column=POSTCODE_COLUMN,
-        manual_uprn_map=MANUAL_UPRN_MAP
-    )
+        # We now retry this chunk's failures (`errors` only holds earlier chunks' row ids at this point)
+        chunk_failed = chunk[chunk["row_id"].isin(errors_chunk)]
+        epc_data_failed, _, _ = get_data(
+            asset_list=chunk_failed,
+            fulladdress_column=FULLADDRESS_COLUMN,
+            address1_column=ADDRESS1_COLUMN,
+            postcode_column=POSTCODE_COLUMN,
+            manual_uprn_map=MANUAL_UPRN_MAP,
+            epc_api_only=False
+        )
 
-    no_data = asset_list[asset_list["row_id"].isin(no_epc)]
-    print(no_data[[FULLADDRESS_COLUMN, POSTCODE_COLUMN]])
+        epc_data_chunk.extend(epc_data_failed)
+        errors.extend(errors_chunk)
+        no_epc.extend(no_epc_chunk)
 
-    # Append the failed data to the main data
-    epc_data.extend(epc_data_failed)
+        # Append the failed data to the main data
+        # Store the chunk locally as a csv
+        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
+
+        epc_data.extend(epc_data_chunk)
 
     epc_df = pd.DataFrame(epc_data)
 
@@ -339,7 +375,7 @@ def app():
         "current-energy-efficiency": "SAP score on register",
         "current-energy-rating": "EPC rating on register",
         "property-type": "Property Type",
-        "built-form": "Archetype",
+        "built-form": "Archetype - EPC",
         "total-floor-area": "Property Floor Area",
         "construction-age-band": "Property Age Band",
         "floor-height": "Property Floor Height",
@@ -375,7 +411,7 @@ def app():
             num_floors=x["Estimated Number of Floors"],
             floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
             perimeter=x["Estimated Perimeter (m)"],
-            built_form=x["Archetype"]
+            built_form=x["Archetype - EPC"]
         ),
         axis=1
     )
@@ -406,3 +442,48 @@ def app():
     matches_review = asset_list[
         [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
     ]
+
+
+import requests
+import base64
+
+API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e"
+URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20"
+email = "itskruel@gmail.com"
+
+AUTH_TOKEN = base64.b64encode(
+    ":".join([email, API_KEY]).encode("utf-8")
+)
+
+AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU="
+
+headers = {
+    "Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN),
+    "Accept": "application/json",
+}
+
+params = {
+    "UPRN": "766024370"
+}
+
+response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370",
+                        headers=headers)
+response.json()
+
+data = response.json()
+
+from operator import itemgetter
+
+newest = sorted(data["rows"], key=itemgetter('lodgement-date'))
+data["rows"][0]["lodgement-date"]
+data["rows"][1]["lodgement-date"]
+
+import pandas as pd
+
+df = pd.DataFrame(data["rows"])
+
+df["uprn"].values[2]
+
+df[df["uprn"] == "3455035000"]["property-type"]
+
+from backend.apis.GoogleSolarApi import GoogleSolarApi
diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py
index 15614a0b..03e651e8 100644
--- a/recommendations/Recommendations.py
+++ b/recommendations/Recommendations.py
@@ -503,7 +503,9 @@ class Recommendations:
                         impact_summary.append(
                             {
                                 "phase": rec["phase"],
+                                "representative": rec["recommendation_id"] in representative_ids,
                                 "recommendation_id": rec["recommendation_id"],
+                                "measure_type": rec["measure_type"],
                                 "sap": sap + rec["sap_points"],
                                 "carbon": carbon - rec["co2_equivalent_savings"],
                                 "heat_demand": heat_demand - rec["heat_demand"],

From 61544d01db865af74608e8d2e9d1ea3e9d727dde Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 12 Feb 2025 10:14:14 +0000
Subject: [PATCH 22/72] updating data pull code

---
 .idea/Model.iml                               |   2 +-
 .idea/misc.xml                                |   2 +-
 etl/customers/remote_assessments/app.py       |  10 +-
 .../stonewater/potential_eco_properties.py    |  12 +-
 etl/route_march_data_pull/app.py              | 322 ++++++++++++++----
 5 files changed, 274 insertions(+), 74 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..df6c4faa 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..50cad4ca 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index e1298565..f32dcea6 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -19,9 +19,9 @@ def app():
 
     asset_list = [
         {
-            "address": "49 Brailsford Road",
-            "postcode": "M14 6PT",
-            "uprn": 77145666,
+            "address": "19 Hillcrest Court",
+            "postcode": "IP21 4YJ",
+            "uprn": 2630134524,
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -52,8 +52,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 77145666,
-            "valuation": 337_000
+            "uprn": 2630134524,
+            "valuation": 96_000
         }
     ]
     # Store valuation data to s3
diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py
index eef82eae..6666ce15 100644
--- a/etl/customers/stonewater/potential_eco_properties.py
+++ b/etl/customers/stonewater/potential_eco_properties.py
@@ -368,9 +368,10 @@ def app():
     additional_properties2 = additional_properties[[
         "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
         "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
-        'Same Postcode as Installed under ECO3'
+        'Same Postcode as Installed under ECO3', "Organisation Reference",
     ]].rename(
         columns={
+            "Organisation Reference": "Org. ref.",
             "SAP": "Parity - Predicted SAP",
             "SAP Band": "Parity - Predicted SAP Band",
             "Age": "Parity - Build Age",
@@ -387,7 +388,12 @@ def app():
     )
 
     # Combine the data:
-    full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2])
+
+    stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
+        features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
+    )
+    full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
+    full_dataset = full_dataset.drop(columns=['Osm. ID'])
 
     # We not define the priority list for non-intrusives
     full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
@@ -414,7 +420,7 @@ def app():
 
     df.to_csv(
         "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
-        "revised list.xlsx",
+        "revised list.csv",
         index=False
     )
 
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index cc50caae..dba85b3f 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -1,7 +1,6 @@
 import os
 import time
-import pickle
-
+from BaseUtility import Definitions
 import pandas as pd
 import numpy as np
 from tqdm import tqdm
@@ -17,6 +16,10 @@ from recommendations.recommendation_utils import (
     estimate_number_of_floors
 )
 
+from etl.epc_clean.epc_attributes.attribute_utils import (
+    extract_thermal_transmittance
+)
+
 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
@@ -158,6 +161,53 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"):
     raise ValueError(f"Method {method} not recognized")
 
 
+def process_age_band(x, year_built_column):
+    """
+    Compare the year a property was built against the age band recorded on its EPC.
+
+    :param x: row of the asset list; must contain "Property Age Band" and the year built column
+    :param year_built_column: name of the column in x holding the year the property was built
+    :return: "No EPC Age Band" if the band or the year is missing/anomalous, otherwise a label saying
+        whether the EPC age band matches, is older than, or is newer than the year built
+    :raises IndexError: if a non-numeric age band has no ": " separator
+    :raises ValueError: if the part after ": " is not a "{lower date}-{upper date}" pair of numbers
+    """
+    matches = "EPC Age Band Matches Year Built"
+    older = "EPC Age Band is older than Year Built"
+    newer = "EPC Age Band is newer than Year Built"
+
+    age_band = x["Property Age Band"]
+    try:
+        year_built = float(x[year_built_column])
+    except (TypeError, ValueError):
+        # A year we cannot parse (None, free text) is treated the same as a missing year
+        year_built = float("nan")
+
+    if pd.isnull(age_band) or age_band in Definitions.DATA_ANOMALY_MATCHES or pd.isnull(year_built):
+        return "No EPC Age Band"
+
+    age_band = str(age_band).strip()
+
+    # Open-ended bands only have one boundary to compare against
+    if age_band == "England and Wales: 2007 onwards":
+        return matches if year_built >= 2007 else newer
+    if age_band == "England and Wales: before 1900":
+        return matches if year_built < 1900 else older
+
+    if age_band.isdigit():
+        # The band is a single year, so the range collapses to that year
+        lower_date = upper_date = float(age_band)
+    else:
+        # Age band will be formatted as such: 'England and Wales: {lower date}-{upper date}'
+        lower_date, upper_date = (float(d) for d in age_band.split(": ")[1].split("-"))
+
+    if year_built > upper_date:
+        return older
+    if year_built < lower_date:
+        return newer
+    return matches
+
+
 def app():
     """
     This app is EPC pulling data for some properties owned by Livewest
@@ -179,17 +229,47 @@ def app():
     Heat loss calculations
     EPC recommendations
     Property UPRN
-
     """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People"
-    DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx"
-    SHEET_NAME = "Assets 1"
+
+    # TODO:
+    # For cavity work:
+    # - Flag any entries that have a different wall type between non-intrusive data against EPC
+    # - Worth double checking entries that have a difference in wall construction
+    # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
+    # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
+    # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
+    # are less than C75
+    # - Flag anything pre SAP2012
+    # - Flag anything over 5 years old
+    # - Look at year built vs age band
+    #
+    # For Solar:
+    # - Discount any that have solar PV - based on non-intrusives and from the inspections team
+    # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
+    # electric room heaters but it might need to be an EPC E
+    # - Fabric - check the floor, wall and roof:
+    #     - Filled or empty cavity is good
+    #     - Insulated solid/timber/system built is good
+    #     - SCIS/CEG needs solid floors
+    #     - JJC don’t care
+    #     - Anything with a loft 200 or below
+    # - Anything C75 and above won’t qualify
+    # - Insulated loft = 200mm
+    # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
+    # - Or the insulation required is loft/cavity (floors should be solid)
+
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight"
+    DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx"
+    SHEET_NAME = "Sheet1"
     POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = "Address"
-    ADDRESS1_COLUMN = "AddressLine1"
+    FULLADDRESS_COLUMN = None
+    ADDRESS1_COLUMN = "HouseName"
     ADDRESS1_METHOD = None
-    ADDRESS_COLS_TO_CONCAT = []
+    ADDRESS_COLS_TO_CONCAT = [
+        "HouseName", "Block", "Address1"
+    ]
     MISSING_POSTCODES_METHOD = None
+    PROPERTY_YEAR_BUILT = 'Built In Year'
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
@@ -216,6 +296,7 @@ def app():
         asset_list[col] = asset_list[col].astype(str)
         asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
         asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)
+        asset_list[col] = asset_list[col].str.strip()
 
     if ADDRESS1_COLUMN is None:
         ADDRESS1_COLUMN = "address1_extracted"
@@ -226,7 +307,15 @@ def app():
     if FULLADDRESS_COLUMN is None:
         FULLADDRESS_COLUMN = "fulladdress_extracted"
         # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
-        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
+        # Sometimes, some of the columns are empty, so we need to remove them
+        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(
+            lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1
+        )
+
+        # We clean up potential non-breaking spaces, and double spaces
+        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str)
+        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False)
+        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)
 
     # We check for duplicated addresses
     asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
@@ -237,8 +326,10 @@ def app():
     asset_list = asset_list.drop(columns=["deduper"])
 
     # We chunk up this data into 5000 rows at a time
+    # Create the chunks directory
+    if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")):
+        os.makedirs(os.path.join(DATA_FOLDER, "Chunks"))
     chunk_size = 5000
-    epc_data = []
     errors = []
     no_epc = []
     skip = None  # Used to skip already completed chunks
@@ -275,9 +366,19 @@ def app():
         # Store the chunk locally as a csv
         pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
 
-        epc_data.extend(epc_data_chunk)
+    # We read in and concatenate the created chunks
+    chunks_folder = os.path.join(DATA_FOLDER, "Chunks")
+    # List the contents
+    chunk_files = os.listdir(chunks_folder)
+    epc_data = []
+    for file in chunk_files:
+        csv_data = pd.read_csv(os.path.join(chunks_folder, file))
+        # We need to convert the recommendations back to a list
+        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
+        csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
+        epc_data.append(csv_data)
 
-    epc_df = pd.DataFrame(epc_data)
+    epc_df = pd.concat(epc_data)
 
     # We expand out the recommendations
     recommendations_df = epc_df[["row_id", "recommendations"]]
@@ -302,9 +403,9 @@ def app():
         transformed_data.append(row_data)
 
     transformed_df = pd.DataFrame(transformed_data)
-    # Drop the column that is ""
-    if "" in transformed_df.columns:
-        transformed_df = transformed_df.drop(columns=[""])
+    # At the moment, we're only using a limited set of columns - let's just keep cavity wall insulation
+    # recommendations
+    transformed_df = transformed_df[["row_id", "Cavity wall insulation"]]
 
     # Get the find my epc data
     find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
@@ -342,7 +443,9 @@ def app():
             "energy-consumption-current",  # kwh/m2
             "photo-supply",
         ]
-    ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"})
+    ].rename(
+        columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
+    )
 
     asset_list = asset_list.merge(
         epc_df,
@@ -422,6 +525,138 @@ def app():
         axis=1
     )
 
+    # We produce some additional fields
+    # 1) Is the SAP rating below C75
+    asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75
+    # 2) Flag anything where the EPC is older than 5 years
+    cutoff_year = pd.Timestamp.now().year - 5
+    asset_list[f"EPC is pre {cutoff_year}"] = (
+        pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year
+    )
+
+    # 3) If we have year in the asset list, we flag entries where the built year is different from the
+    # EPC Age band
+    if PROPERTY_YEAR_BUILT is not None:
+        asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
+            lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
+        )
+
+    # 4) Flag properties that look like they're good candidates for solar installs
+    # Firstly, flag if the fabric is completely done
+
+    insulated_wall_substrings = [
+        ", insulated", "with external insulation", "with internal insulation", "filled cavity"
+    ]
+
+    insulated_roof_substrings = [
+        "(another dwelling above)", "limited insulation", "(other premises above)",
+        ", no insulation",
+    ]
+
+    def check_solar_insulation_conditions(x):
+
+        if pd.isnull(x["Wall Construction"]):
+            return None
+
+        if "average thermal transmittance" in x["Wall Construction"].lower():
+            # We extract out the u-values
+            wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"]
+            roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"]
+            floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"]
+
+            roof_uvalue = 0 if roof_uvalue is None else roof_uvalue
+            floor_uvalue = 0 if floor_uvalue is None else floor_uvalue
+
+            # We apply some cutoffs
+            if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7:
+                return "Walls, Roof and Floor have U-values below 0.7"
+
+            return "Confirm U-values"
+
+        walls_insulated = any(
+            insulated_substring in x["Wall Construction"].lower() for insulated_substring in insulated_wall_substrings
+        )
+        roof_is_numeric = False
+        if str(x["Roof Insulation Thickness"]).isdigit():
+            roof_is_numeric = True
+            roof_insulated = int(x["Roof Insulation Thickness"]) >= 200
+        else:
+            roof_insulated = any(
+                insulated_substring in x["Roof Construction"].lower() for insulated_substring in
+                insulated_roof_substrings
+            )
+
+        floor_is_solid = "solid" in x["Floor Construction"].lower()
+
+        if walls_insulated and roof_insulated and floor_is_solid:
+            return "Walls Insulated, Roof Insulated, Floor Solid"
+
+        if walls_insulated and floor_is_solid and roof_is_numeric:
+            return "Walls Insulated, Floor Solid, Loft need top-up"
+
+        return "Not Fully Insulated or no data"
+
+    asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1)
+
+    asset_list["Good Solar Candidate"] = (
+        asset_list["SAP Rating is 75 and below"] &
+        ~asset_list["Has Solar PV"] &
+        (
+            asset_list["Heating Type"].isin(
+                [
+                    "Electric storage heaters",
+                    "Room heaters, electric",
+                ]
+            ) | asset_list["Heating Type"].str.contains("heat pump", case=False)
+        ) & (
+            asset_list["Solar Fabric Condition"].isin(
+                [
+                    "Walls Insulated, Roof Insulated, Floor Solid",
+                    "Walls, Roof and Floor have U-values below 0.7",
+                    "Walls Insulated, Floor Solid, Loft need top-up"
+                ]
+            )
+        )
+    )
+
+    def flat_analysis(asset_list):
+
+        # We need to deduce the building name - we strip out the house number
+        def extract_building_name(x):
+            # TODO: This doesn't really work
+            if pd.isnull(x):
+                return None
+            house_no = SearchEpc.get_house_number(address=x, postcode=None)
+            if house_no:
+                return x.replace(house_no, "").strip()
+            return x.split(",")[0].strip()
+
+        # We want to deduce if flats have 50% of the properties below C75
+        # We group by postcode and property type
+        grouped = asset_list.groupby(["Postcode", "Property Type"])
+
+        flat_data = []
+        for _, group in grouped:
+            if "flat" in group["Property Type"].str.lower().values:
+                num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0)
+                num_below_c75 = group["SAP score on register"].lt(75).sum()
+
+                flat_data.append(
+                    {
+                        "Postcode": group["Postcode"].iloc[0],
+                        "Property Type": "Flat",
+                        "Number of Flats with EPC": num_flats,
+                        "Number of Flats below C75": num_below_c75,
+                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats)
+                    }
+                )
+
+        flat_data = pd.DataFrame(flat_data)
+
+        return flat_data
+
+    flat_data = flat_analysis(asset_list)
+
     # For all of the columns in transformed_df, prefix with "Recommendation: "
     for col in transformed_df.columns:
         if col == "row_id":
@@ -436,54 +671,13 @@ def app():
     asset_list = asset_list.drop(columns=["row_id", "index"])
 
     # Store as an excel
-    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx"
-    asset_list.to_excel(filename, index=False)
+    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
+    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
+
+    with pd.ExcelWriter(filename) as writer:
+        asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
+        flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
 
     matches_review = asset_list[
         [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
     ]
-
-
-import requests
-import base64
-
-API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e"
-URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20"
-email = "itskruel@gmail.com"
-
-AUTH_TOKEN = base64.b64encode(
-    ":".join([email, API_KEY]).encode("utf-8")
-)
-
-AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU="
-
-headers = {
-    "Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN),
-    "Accept": "application/json",
-}
-
-params = {
-    "UPRN": "766024370"
-}
-
-response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370",
-                        headers=headers)
-response.json()
-
-data = response.json()
-
-from operator import itemgetter
-
-newest = sorted(data["rows"], key=itemgetter('lodgement-date'))
-data["rows"][0]["lodgement-date"]
-data["rows"][1]["lodgement-date"]
-
-import pandas as pd
-
-df = pd.DataFrame(data["rows"])
-
-df["uprn"].values[2]
-
-df[df["uprn"] == "3455035000"]["property-type"]
-
-from backend.apis.GoogleSolarApi import GoogleSolarApi

From 959d29b675a6b8e6c57074d5a9fe5a3973ed1d96 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 12 Feb 2025 15:20:55 +0000
Subject: [PATCH 23/72] allowing optional ashp cop parameter

---
 backend/app/plan/router.py         |  5 +++--
 backend/app/plan/schemas.py        |  2 ++
 etl/customers/l_and_g/ic_slides.py |  5 ++++-
 recommendations/Recommendations.py | 25 +++++++++++++++++++------
 4 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 04a2ef7f..f85ceacc 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -370,7 +370,7 @@ def extract_property_request_data(
         property_non_invasive_recommendations["recommendations"] = str(transformed)
 
     # Check if the valuation data has uprn
-    valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else True
+    valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False
     if valuation_has_uprn:
         valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None]
 
@@ -692,7 +692,8 @@ async def trigger_plan(body: PlanTriggerRequest):
                 Recommendations.calculate_recommendation_tenant_savings(
                     property_instance=property_instance,
                     kwh_simulation_predictions=kwh_simulation_predictions,
-                    property_recommendations=property_recommendations
+                    property_recommendations=property_recommendations,
+                    ashp_cop=body.ashp_cop
                 )
             )
             property_instance.current_energy_bill = property_current_energy_bill
diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py
index f84912fe..618bec90 100644
--- a/backend/app/plan/schemas.py
+++ b/backend/app/plan/schemas.py
@@ -80,3 +80,5 @@ class PlanTriggerRequest(BaseModel):
     multi_plan: Optional[bool] = False
     optimise: Optional[bool] = True
     default_u_values: Optional[bool] = True
+
+    ashp_cop: Optional[float] = 2.8
diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py
index 72dfc2c0..a5cb3511 100644
--- a/etl/customers/l_and_g/ic_slides.py
+++ b/etl/customers/l_and_g/ic_slides.py
@@ -132,7 +132,7 @@ def get_data(portfolio_id, scenario_ids):
     return properties_data, plans_data, recommendations_data
 
 
-properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[199])
+properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[205])
 
 properties_df = pd.DataFrame(properties_data)
 plans_df = pd.DataFrame(plans_data)
@@ -240,4 +240,7 @@ df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
 df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
 df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x))
 
+df["Recommendation: Air Source Heat Pump"].sum()
+df["Cost: Air Source Heat Pump"].sum()
+
 df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False)
diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py
index 03e651e8..42f4e783 100644
--- a/recommendations/Recommendations.py
+++ b/recommendations/Recommendations.py
@@ -649,7 +649,9 @@ class Recommendations:
         return property_recommendations, impact_summary
 
     @staticmethod
-    def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description):
+    def map_descriptions_to_fuel(
+        heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types
+    ):
 
         # Handle the case of community schemes
         if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"):
@@ -662,7 +664,7 @@ class Recommendations:
                 }
             raise NotImplementedError("Handle this case")
 
-        mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description]
+        mapped = descriptions_to_fuel_types[heating_description]
         heating_fuel = mapped["fuel"]
 
         if hotwater_description in [
@@ -682,7 +684,7 @@ class Recommendations:
                 "heating_cop": mapped["cop"], "hotwater_cop": 1
             }
 
-        mapped_hotwater = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description]
+        mapped_hotwater = descriptions_to_fuel_types[hotwater_description]
 
         return {
             "heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"],
@@ -691,7 +693,7 @@ class Recommendations:
 
     @classmethod
     def calculate_recommendation_tenant_savings(
-        cls, property_instance, kwh_simulation_predictions, property_recommendations
+        cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None
     ):
         """
         This method inserts the kwh savings and the bill savings that the customer will make from the recommendations
@@ -703,9 +705,12 @@ class Recommendations:
         :param property_instance: Instance of the Property class, for the home associated to property_id
         :param kwh_simulation_predictions: dictionary of predictions from the model apis
         :param property_recommendations: dictionary of recommendations for the property
+        :param ashp_cop: The coefficient of performance for the air source heat pump.
         :return:
         """
 
+        ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY
+
         kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][
             kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id)
             ].merge(
@@ -774,12 +779,19 @@ class Recommendations:
                     if kwh_impact_table.loc[i, col] > previous_phase[col].max():
                         kwh_impact_table.loc[i, col] = previous_phase[col].max()
 
+        descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES
+        # We set the air source heat pump efficiencies (NOTE: this mutates the shared assumptions dict)
+        ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()]
+        for k in ashp_keys:
+            descriptions_to_fuel_types[k]["cop"] = ashp_cop
+
         # For heating system recommendations, this could result in a fuel type change so we reflect that
         fuel_mapping = pd.DataFrame([
             {
                 "id": epc["id"],
                 **cls.map_descriptions_to_fuel(
-                    epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"]
+                    epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"],
+                    descriptions_to_fuel_types
                 )
             } for epc in property_instance.updated_simulation_epcs
         ])
@@ -793,7 +805,8 @@ class Recommendations:
                             **cls.map_descriptions_to_fuel(
                                 property_instance.data["mainheat-description"],
                                 property_instance.data["hotwater-description"],
-                                property_instance.data["main-fuel"]
+                                property_instance.data["main-fuel"],
+                                descriptions_to_fuel_types
                             )
                         }
                     ]

From 6396f081c15a56dcb799db1edd64edbb89c56921 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 12 Feb 2025 16:19:52 +0000
Subject: [PATCH 24/72] stonewater extracting age

---
 .idea/Model.iml                                | 2 +-
 .idea/misc.xml                                 | 2 +-
 etl/customers/stonewater/Wave 3 Preparation.py | 7 ++++++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index b2a92e4c..24a8e9bb 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -125,6 +125,7 @@ def extract_summary_report(pdf_path):
     - Address
     """
 
+    blah
     data = {
         "Address": None,
         "Postcode": None,
@@ -701,6 +702,7 @@ def extract_epr(pdf_path):
         "Primary Energy Use (kWh/yr)": None,
         "Primary Energy Use Intensity (kWh/m2/yr)": None,
         "Number of Storeys": None,
+        "Main Building Age Band": None,
         "Fuel Bill": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
@@ -779,6 +781,10 @@ def extract_epr(pdf_path):
             floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
             data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)
 
+        # Extract age band
+        age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text)
+        data["Main Building Age Band"] = age_band_match.group(1)
+
         # Extract Number of Storeys
         storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
         data["Number of Storeys"] = int(storeys_match.group(1))
@@ -3022,7 +3028,6 @@ def revised_model():
     # We now do a large pull of all of the data
     extracted_data = []
     for survey_folder in tqdm(survey_folders):
-
         survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
 
         # Check that the survey folder is actually a folder

From 84d4070b490a04d0cf4fdefc20ab4aaaab1d7d05 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 12 Feb 2025 17:10:21 +0000
Subject: [PATCH 25/72] extracting from ima

---
 .../stonewater/Wave 3 Preparation.py          | 61 ++++++++++++++++++-
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 24a8e9bb..e471211c 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -125,13 +125,13 @@ def extract_summary_report(pdf_path):
     - Address
     """
 
-    blah
     data = {
         "Address": None,
         "Postcode": None,
         "Current SAP Rating": None,
         "Current EPC Band": None,
         "Fuel Bill": None,
+        "Main Building Age Band": None,
         "Number of Storeys": None,
         "Window Age Description": None,
         "Window Age Description Proportion (%)": None,
@@ -181,6 +181,10 @@ def extract_summary_report(pdf_path):
         sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
         data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]
 
+        # Extract age
+        age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text)
+        data["Main Building Age Band"] = age_band_match.group(1)
+
         # Number of storeys
         storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
         data["Number of Storeys"] = int(storeys_match.group(1))
@@ -3027,6 +3031,7 @@ def revised_model():
 
     # We now do a large pull of all of the data
     extracted_data = []
+    mtp_extracted_data = []  # Additional data to extract from the medium term plans
     for survey_folder in tqdm(survey_folders):
         survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
 
@@ -3048,6 +3053,58 @@ def revised_model():
             None
         )
 
+        mtp_folder = next(
+            (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()),
+            None
+        )
+        if mtp_folder:
+            # We have a mid term plan:
+            mtp_folder_path = os.path.join(survey_folder_path, mtp_folder)
+            # Get the contents - files and not folder
+            mtp_contents = [
+                os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path)
+                if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file))
+            ]
+            # We check the the IMA
+            for file_name in mtp_contents:
+                filepath = os.path.join(survey_folder_path, file_name)
+                # We expect a pdf so try and parse it
+                try:
+                    with open(filepath, "rb") as file:
+                        reader = PyPDF2.PdfReader(file)
+                        # Just the first page
+                        text = reader.pages[0].extract_text()
+
+                except Exception as e:
+                    continue
+
+                # We check if this is an IMA
+                ima_heading_search = re.search(
+                    r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text
+                )
+
+                is_ima = bool(ima_heading_search)
+                if not is_ima:
+                    continue
+
+                # Otherwise, extract: RIR, PV
+                pv_search = re.search(r"PV \(\d+Kwp\)", text)
+                has_pv = bool(pv_search)
+                pv_system = pv_search.group(0) if has_pv else None
+
+                rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text)
+                has_rir = bool(rir_search)
+                rir_spec = rir_search.group(0) if has_rir else None
+
+                mtp_extracted_data.append({
+                    "survey_folder": survey_folder,
+                    "has_pv": has_pv,
+                    "PV System": pv_system,
+                    "RIR Specification": rir_spec,
+                    "has_rir": has_rir
+                })
+                continue
+
         # If retrofit assessment folder exists, check if it has content
         if retrofit_folder or ra_folder:
             if retrofit_folder:
@@ -3094,7 +3151,7 @@ def revised_model():
     retrofit_assessment_data = pd.DataFrame(extracted_data)
 
     # retrofit_assessment_data.to_csv(
-    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False
     # )
     retrofit_assessment_data = pd.read_csv(
         os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"),

From 711db3f552e958128faeb49a22073e5461dbc4f6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 13 Feb 2025 07:59:12 +0000
Subject: [PATCH 26/72] adding v1 extraction to stonewater

---
 .../stonewater/Wave 3 Preparation.py          | 53 +++++++++++++++++--
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index e471211c..12158671 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -182,7 +182,10 @@ def extract_summary_report(pdf_path):
         data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]
 
         # Extract age
-        age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text)
+        age_band_match = re.search(
+            r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
+            text
+        )
         data["Main Building Age Band"] = age_band_match.group(1)
 
         # Number of storeys
@@ -786,7 +789,11 @@ def extract_epr(pdf_path):
             data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)
 
         # Extract age band
-        age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text)
+        age_band_match = re.search(
+            r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
+            text
+        )
+
         data["Main Building Age Band"] = age_band_match.group(1)
 
         # Extract Number of Storeys
@@ -3065,8 +3072,21 @@ def revised_model():
                 os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path)
                 if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file))
             ]
+
+            has_v1 = [
+                f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower()
+            ]
+
+            if has_v1:
+                # Then we go one level deeper
+                mtp_contents = [
+                    os.path.join(has_v1[0], f) for f in
+                    os.listdir(os.path.join(survey_folder_path, has_v1[0]))
+                ]
+
             # We check the the IMA
             for file_name in mtp_contents:
+
                 filepath = os.path.join(survey_folder_path, file_name)
                 # We expect a pdf so try and parse it
                 try:
@@ -3092,6 +3112,12 @@ def revised_model():
                 has_pv = bool(pv_search)
                 pv_system = pv_search.group(0) if has_pv else None
 
+                # We perform a second search for PV:
+                if pv_search is None:
+                    pv_search = re.search("solar pv", text.lower())
+                    has_pv = bool(pv_search)
+                    pv_system = "Solar PV" if has_pv else None
+
                 rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text)
                 has_rir = bool(rir_search)
                 rir_spec = rir_search.group(0) if has_rir else None
@@ -3149,12 +3175,20 @@ def revised_model():
             extracted_data.append(summary_data)
 
     retrofit_assessment_data = pd.DataFrame(extracted_data)
+    mtp_df = pd.DataFrame(mtp_extracted_data)
 
+    # Save
     # retrofit_assessment_data.to_csv(
     #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False
     # )
+    # mtp_df.to_csv(
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False
+    # )
     retrofit_assessment_data = pd.read_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"),
+        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"),
+    )
+    mtp_df = pd.read_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"),
     )
 
     # Remove some definite duplicates
@@ -3164,6 +3198,9 @@ def revised_model():
     # Get all of the folders that end with ROSS
     to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()
 
+    # Replace \n with ""
+    retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
+
     retrofit_assessment_data = retrofit_assessment_data[
         ~retrofit_assessment_data["survey_folder"].isin(
             [
@@ -3173,8 +3210,6 @@ def revised_model():
             ] + to_drop
         )
     ]
-    # Replace \n with ""
-    retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
 
     retrofit_assessments_data_columns = [
         'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)',
@@ -3685,9 +3720,17 @@ def revised_model():
     if not missed_asset_id.empty:
         raise Exception("Missing Asset ID")
 
+    # We merge the mpt data on to the wates coordination
+    wates_coordination = wates_coordination.merge(
+        mtp_df, how="left", on="survey_folder"
+    )
+
     ccs_coordination = ccs_coordination.merge(
         ccs_matching_lookup, how="left", on="Name"
     )
+    ccs_coordination = ccs_coordination.merge(
+        mtp_df, how="left", on="survey_folder"
+    )
 
     retrofit_packages_board = retrofit_packages_board.merge(
         matching_lookup, how="left", on="Name"

From b8a094106c7a8ff7260648ba18d8d48b8f8715e1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 13 Feb 2025 17:28:47 +0000
Subject: [PATCH 27/72] updating stonewater

---
 .idea/Model.iml                               |  2 +-
 .idea/misc.xml                                |  2 +-
 etl/customers/remote_assessments/app.py       | 12 ++--
 .../stonewater/Wave 3 Preparation.py          | 72 ++++++++++++-------
 etl/customers/stonewater/data_cleaning.py     | 59 ++++++++-------
 5 files changed, 89 insertions(+), 58 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..df6c4faa 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..50cad4ca 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index f32dcea6..70ceb76d 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 127
+PORTFOLIO_ID = 128
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,9 +19,9 @@ def app():
 
     asset_list = [
         {
-            "address": "19 Hillcrest Court",
-            "postcode": "IP21 4YJ",
-            "uprn": 2630134524,
+            "address": "46",
+            "postcode": "BS6 7BD",
+            "uprn": 61091,
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -52,8 +52,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 2630134524,
-            "valuation": 96_000
+            "uprn": 61091,
+            "valuation": 897_000
         }
     ]
     # Store valuation data to s3
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 12158671..94904aae 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3028,11 +3028,12 @@ def revised_model():
         "10. Little Island",
         "11. CCS Dorset"
     ]
+    wave_21_folder_name = "Wave 2.1 Surveys - 2"
 
     for wave_2_1_folder in wave_21_folders:
-        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 2.1 Surveys", wave_2_1_folder)
+        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder)
         if os.path.isdir(folder_path):  # Check if folder exists
-            folder_contents = [os.path.join("Wave 2.1 Surveys", wave_2_1_folder, file) for file in
+            folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in
                                os.listdir(folder_path)]
             survey_folders.extend(folder_contents)  # Append contents to the master list
 
@@ -3179,18 +3180,32 @@ def revised_model():
 
     # Save
     # retrofit_assessment_data.to_csv(
-    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False
     # )
     # mtp_df.to_csv(
-    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False
     # )
     retrofit_assessment_data = pd.read_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"),
+        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"),
     )
     mtp_df = pd.read_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"),
+        os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"),
     )
 
+    # There are a few duplicates we just manually drop
+    mtp_df = mtp_df.drop_duplicates()
+    mtp_df = mtp_df[
+        ~((
+              mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27"
+          ) & (~mtp_df["has_pv"]))
+    ]
+
+    mtp_df = mtp_df[
+        ~((
+              mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5"
+          ) & (~mtp_df["has_pv"]))
+    ]
+
     # Remove some definite duplicates
     dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"]
     dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)]
@@ -3487,7 +3502,7 @@ def revised_model():
     ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"]
 
     ccs_manual_filters = {
-        "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35"
+        "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35"
     }
     ccs_matching_lookup = []
     for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
@@ -3583,13 +3598,13 @@ def revised_model():
     ]
 
     wates_manual_filters = {
-        "24 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/24-25 Rabley Wood View",
-        "14 Edencroft": "Wave 2.1 Surveys/3. Wiltshire/14 Edencroft",
-        "Flat 31 Rabley Wood View": "Wave 2.1 Surveys/3. Wiltshire/Flat 31  Rabley Wood View",
-        'Flat 13, Manor Fields': 'Wave 2.1 Surveys/1. Herefordshire/(038) Manor Fields Flat 13',
-        "4 Kittys Lane": "Wave 2.1 Surveys/1. Herefordshire/(005) Kittys Lane 4",
-        '1 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 1',
-        '2 Jephson Court': 'Wave 2.1 Surveys/5. Coventry/Jesphson Court 2',
+        "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View",
+        "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft",
+        "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31  Rabley Wood View",
+        'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13',
+        "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4",
+        '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1',
+        '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2',
     }
     wates_matching_lookup = []
     # Examples to skip when we cannot get the data
@@ -3720,6 +3735,9 @@ def revised_model():
     if not missed_asset_id.empty:
         raise Exception("Missing Asset ID")
 
+    if wates_coordination["Asset ID_x"].duplicated().sum():
+        raise Exception("Duplicated IDs in wates")
+
     # We merge the mpt data on to the wates coordination
     wates_coordination = wates_coordination.merge(
         mtp_df, how="left", on="survey_folder"
@@ -3839,29 +3857,31 @@ def revised_model():
 
     def find_nearest_matching_property(coordinated_packages, home):
         filter_levels = [
-            ["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
-            ["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
-            ["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
-            ["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"],
-            ["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"],
-            ["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"],
+            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1),
+            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
+            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
+            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4),
+            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5),
+            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6),
         ]
 
-        for i, filters in enumerate(filter_levels):
+        max_confidence = max([confidence for (_, confidence) in filter_levels])
+
+        for i, (filters, match_confidence) in enumerate(filter_levels):
             match = coordinated_packages.copy()
 
             for col in filters:
                 match = match[match[col] == home[col]]
 
             if not match.empty:
-                return match
+                return match, match_confidence
 
         # Finally, we search for a property in the same Archetype
         match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]]
         if not match.empty:
-            return match
+            return match, max_confidence + 1
 
-        return None  # No match found
+        return None, None  # No match found
 
     coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
     new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip()
@@ -3896,8 +3916,8 @@ def revised_model():
             ]
             matches.extend(to_extend)
             continue
-
-        closest_match = find_nearest_matching_property(coordinated_packages, home)
+        blah
+        closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home)
         if closest_match is None:
             no_match.append(home["Organisation Reference"])
             continue
diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py
index 010902ce..a5da0c79 100644
--- a/etl/customers/stonewater/data_cleaning.py
+++ b/etl/customers/stonewater/data_cleaning.py
@@ -86,8 +86,14 @@ def download_data_from_sharepoint():
         folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders"
     )
 
+    folders_to_keep = [
+        "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth",
+        "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire",
+        "9. Guildford", "10. Little Island", "11. CCS Dorset",
+    ]
+
     folders_to_pull = [
-        folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. Coventry"]
+        folder for folder in contents["value"] if folder["name"] in folders_to_keep
     ]
     for folder_to_pull in folders_to_pull:
         # Get the contents
@@ -109,35 +115,40 @@ def download_data_from_sharepoint():
             )
             if not property_folder_contents.get("value"):
                 continue
-            # We look for the retrofit assessment folder:
+            # We look for the retrofit assessment folder or mtp folders:
             property_sub_folders = [
-                f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower()
+                f for f in property_folder_contents["value"] if
+                "ra coordinator info" in f["name"].lower() or
+                "retrofit assessment" in f["name"].lower() or
+                "ra info" in f["name"].lower() or
+                "mtp" in f["name"].lower() or
+                "mid-term" in f["name"].lower()
             ]
 
             if not property_sub_folders:
                 continue
 
-            # if we have this, we download the folder and store it on my laptop!
-            property_sub_folder = property_sub_folders[0]
+            for property_sub_folder in property_sub_folders:
+                # if we have this, we download the folder and store it on my laptop!
 
-            property_folder_path = os.path.join(
-                "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders",
-                folder_to_pull["name"],
-                property_folder["name"],
-                property_sub_folder["name"]
-            )
+                property_folder_path = os.path.join(
+                    "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders",
+                    folder_to_pull["name"],
+                    property_folder["name"],
+                    property_sub_folder["name"]
+                )
 
-            download_dir = os.path.join(
-                "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys",
-                folder_to_pull["name"],
-                property_folder["name"],
-                property_sub_folder["name"]
-            )
+                download_dir = os.path.join(
+                    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2",
+                    folder_to_pull["name"],
+                    property_folder["name"],
+                    property_sub_folder["name"]
+                )
 
-            # We download the folder
-            sharepoint_client.download_sharepoint_folder(
-                drive_id=sharepoint_client.document_drive["id"],
-                folder_path=property_folder_path,
-                download_dir=download_dir,
-                excluded_file_types=["MOV", "jpg"]
-            )
+                # We download the folder
+                sharepoint_client.download_sharepoint_folder(
+                    drive_id=sharepoint_client.document_drive["id"],
+                    folder_path=property_folder_path,
+                    download_dir=download_dir,
+                    excluded_file_types=["MOV", "jpg"]
+                )

From bd131a2f663056fb46a906d8f148b2bcc06cd871 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 13 Feb 2025 22:32:31 +0000
Subject: [PATCH 28/72] preparing outputs for stonewater

---
 .../stonewater/Wave 3 Preparation.py          | 77 +++++++++++++++----
 1 file changed, 62 insertions(+), 15 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 94904aae..50dadcaf 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2984,6 +2984,8 @@ def revised_model():
     original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
     original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)
 
+    wave_21_folder_name = "Wave 2.1 Surveys - 2"
+
     # Check if we have all of the addresses
     missed = original_archetypes[
         ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
@@ -3028,7 +3030,6 @@ def revised_model():
         "10. Little Island",
         "11. CCS Dorset"
     ]
-    wave_21_folder_name = "Wave 2.1 Surveys - 2"
 
     for wave_2_1_folder in wave_21_folders:
         folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder)
@@ -3252,7 +3253,9 @@ def revised_model():
         'Main Wall Thickness', 'Main Building Alternative Wall Type',
         'Main Building Alternative Wall Insulation',
         'Main Building Alternative Wall Dry-lining',
-        'Main Building Alternative Wall Thickness', 'Main Fuel'
+        'Main Building Alternative Wall Thickness',
+        'Main Fuel',
+        'Main Building Age Band',
     ]
     # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
     retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
@@ -3795,7 +3798,8 @@ def revised_model():
                     "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
                     'SAP Band Install Package', 'Package Approved (Client)',
                     'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
-                    'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
+                    'Ventilation', 'Heating', 'Other Measures', 'PV System',
+                    "Asset ID.1_y",
                 ] + retrofit_assessments_data_columns_prefixed
                 ].rename(
                 columns={
@@ -3811,6 +3815,7 @@ def revised_model():
                     'Heating': 'Main Heating',
                     'Other Measures': 'Other measures',
                     'Asset ID.1_y': 'Organisation Reference',
+                    "PV System": "Solar PV",
                 }
             ),
             wates_coordination[
@@ -3818,8 +3823,7 @@ def revised_model():
                     "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
                     'SAP Band Install Package', 'Package Approved (Client)',
                     'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
-                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'
-
+                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System"
                 ] + retrofit_assessments_data_columns_prefixed
                 ].rename(
                 columns={
@@ -3835,6 +3839,7 @@ def revised_model():
                     'Heating': 'Main Heating',
                     'Other Measures': 'Other measures',
                     'Asset ID_x': 'Organisation Reference',
+                    "PV System": "Solar PV",
                 }
             )
         ]
@@ -3857,12 +3862,12 @@ def revised_model():
 
     def find_nearest_matching_property(coordinated_packages, home):
         filter_levels = [
-            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1),
-            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
-            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
-            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4),
-            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5),
-            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6),
+            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
+            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
+            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4),
+            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5),
+            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6),
+            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7),
         ]
 
         max_confidence = max([confidence for (_, confidence) in filter_levels])
@@ -3911,12 +3916,13 @@ def revised_model():
                 {
                     "Organisation Reference": home["Organisation Reference"],
                     "Best Match Organisation Reference": m,
+                    "match_confidence": 1,
                     "Was Surveyed": True
                 } for m in survey_result["Organisation Reference"].values
             ]
             matches.extend(to_extend)
             continue
-        blah
+
         closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home)
         if closest_match is None:
             no_match.append(home["Organisation Reference"])
@@ -3926,6 +3932,7 @@ def revised_model():
             {
                 "Organisation Reference": home["Organisation Reference"],
                 "Best Match Organisation Reference": m,
+                "match_confidence": match_confidence,
                 "Was Surveyed": False
             } for m in closest_match["Organisation Reference"].values
         ]
@@ -3953,10 +3960,29 @@ def revised_model():
         suffixes=("", " - Closest Match")
     )
 
+    measures_columns = [
+        'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+        'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+        'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+        'Solar PV', 'Other measures'
+    ]
+
     # We want to aggregate the matches, when we have multiple
     aggregated_matches_df = []
     for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+
+        measures = coordinated_packages[
+            (
+                coordinated_packages["Organisation Reference"].isin(
+                    mapped_matches['Best Match Organisation Reference'].values
+                )
+            )
+        ][measures_columns]
+
         if mapped_matches.shape[0] == 1:
+            # Get the measures for this property
+            measures = measures.squeeze()
+
             aggregated_matches_df.append(
                 {
                     "Organisation Reference": org_ref,
@@ -3965,6 +3991,7 @@ def revised_model():
                     "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
                     "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0],
                     "Was Surveyed": mapped_matches["Was Surveyed"].values[0],
+                    **measures
                 }
             )
             continue
@@ -3978,6 +4005,17 @@ def revised_model():
             mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
                 0] / number_of_matches * 100
         )
+
+        measures_aggregated = {}
+        for m in measures_columns:
+            if any(~pd.isnull(measures[m])):
+                # Check if we have 2 unique values
+                vals = measures[~pd.isnull(measures[m])][m].unique()
+                if len(vals) > 1:
+                    measures_aggregated[m] = ", ".join(vals)
+                else:
+                    measures_aggregated[m] = vals[0]
+
         aggregated_matches_df.append(
             {
                 "Organisation Reference": org_ref,
@@ -3985,7 +4023,8 @@ def revised_model():
                 "Proportion": proportion_with_this_epc,
                 "Estimated SAP Rating": average_rating,
                 "Estimated EPC Rating": average_epc_rating,
-                "Was Surveyed": False
+                "Was Surveyed": False,
+                **measures_aggregated
             }
         )
 
@@ -4002,7 +4041,6 @@ def revised_model():
     def remove_leading_zero(address):
         return re.sub(r"^0([1-9]) ", r"\1 ", address)
 
-    # Example usage
     mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
     mapped_priority_list["address1"] = np.where(
         mapped_priority_list["Organisation Reference"] == 37004,
@@ -4020,6 +4058,13 @@ def revised_model():
     )
     mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]
 
+    # Flag where 2 out of the three columns have consensus
+    mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = (
+        (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) |
+        (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) |
+        (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"])
+    )
+
     # Let's get the newest EPC data for these properties
     # We merge on UPRN, when we have it
     # from etl.route_march_data_pull.app import get_data
@@ -4081,6 +4126,7 @@ def revised_model():
             'Survey: Main Building Alternative Wall Dry-lining',
             'Survey: Main Building Alternative Wall Thickness',
             'Survey: Main Fuel',
+            'Survey: Main Building Age Band',
             'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
         ]
     ].rename(
@@ -4133,7 +4179,8 @@ def revised_model():
             [
                 "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
                 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
-                'Survey: Existing Primary Heating System',
+                'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band',
+                'Survey: Main Building Wall Area (m2)',
             ]
         ].rename(
             columns={

From 846cd99631923224d4ba8d776bdeaed35b08884a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 15 Feb 2025 16:05:57 +0000
Subject: [PATCH 29/72] switch off solar PV if property is listed/heritage or
 in a conservation area

---
 backend/Property.py                           |  5 ++++
 backend/app/plan/router.py                    |  6 ++--
 etl/customers/lambeth/re-knocks.py            | 23 +++++++++++++++
 .../stonewater/Wave 3 Preparation.py          | 28 +++++++++++++------
 etl/route_march_data_pull/app.py              | 22 +++++++--------
 5 files changed, 62 insertions(+), 22 deletions(-)
 create mode 100644 etl/customers/lambeth/re-knocks.py

diff --git a/backend/Property.py b/backend/Property.py
index a495431f..e19970eb 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -395,6 +395,7 @@ class Property:
                     primary_recommendation_id=rec["recommendation_id"],
                     non_invasive_recommendations=self.non_invasive_recommendations,
                 )
+
                 self.recommendations_scoring_data.append(scoring_dict)
 
                 simulation_epc = self.epc_record.prepared_epc.copy()
@@ -1258,6 +1259,10 @@ class Property:
         if (self.building_id is not None) and (self.solar_panel_configuration is not None):
             return True
 
+        # If the property is in a conservation area, don't recommend
+        if self.restricted_measures:
+            return False
+
         is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"]
         is_valid_roof_type = (
             self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"]
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index f85ceacc..949c8e4c 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -639,8 +639,10 @@ async def trigger_plan(body: PlanTriggerRequest):
         recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
 
         recommendations_scoring_data = recommendations_scoring_data.drop(
-            columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
-                     "carbon_ending"]
+            columns=[
+                "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
+                "carbon_ending"
+            ]
         )
 
         all_predictions = await model_api.async_paginated_predictions(
diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py
new file mode 100644
index 00000000..1de91b50
--- /dev/null
+++ b/etl/customers/lambeth/re-knocks.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+data = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx", sheet_name="Possible Route",
+    header=1
+)
+
+data["Outcomes"].value_counts()
+
+# Strip out: No
+
+df = data[data["Outcomes"] == "See notes"]
+notes_df = df[
+    ("Notes (If 'no answer' under outcomes, have you checked around the property for access issues where "
+     "possible?)")].value_counts().to_frame()
+
+example = df[df["Notes (If 'no answer' under outcomes, have you checked around the property for access issues where "
+                "possible?)"] == ('Access to rear of property only through number 10. Overgrown athe rear of property '
+                                  'installer wont be able to access')
+             ]
+
+# 18 did not attend
+#
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 50dadcaf..95fe4fcd 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -4093,7 +4093,9 @@ def revised_model():
             'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
             'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
             'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
-            'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band',
+            'Solar PV', 'Other measures',
+            'Survey: Current SAP Rating',
+            'Survey: Current EPC Band',
             'Survey: Primary Energy Use (kWh/yr)',
             'Survey: Primary Energy Use Intensity (kWh/m2/yr)',
             'Survey: Number of Storeys', 'Survey: Fuel Bill',
@@ -4148,7 +4150,8 @@ def revised_model():
             'Best Match Organisation Reference',
             'Survey: Current EPC Band',
             'Survey: Current SAP Rating',
-            "Was Surveyed"
+            "Was Surveyed",
+            "match_confidence",
         ]
     ].rename(
         columns={
@@ -4157,11 +4160,13 @@ def revised_model():
             'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating"
         }
     ).merge(
-        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type",
+                  "Total Floor Area"]],
         how="left",
         on="Organisation Reference"
     ).merge(
-        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type",
+                  "Total Floor Area"]].rename(
             columns={
                 "Organisation Reference": "Best Match - Organisation Reference",
                 "Walls": "Best Match - Walls",
@@ -4169,7 +4174,8 @@ def revised_model():
                 "Heating": "Best Match - Heating",
                 "Main Fuel": "Best Match - Main Fuel",
                 "Age": "Best Match - Age",
-                "Property Type": "Best Match - Property Type"
+                "Property Type": "Best Match - Property Type",
+                "Total Floor Area": "Best Match - Total Floor Area"
             }
         ),
         how="left",
@@ -4180,7 +4186,8 @@ def revised_model():
                 "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
                 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
                 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band',
-                'Survey: Main Building Wall Area (m2)',
+                'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)',
+                'Survey: Main Building Age Band',
             ]
         ].rename(
             columns={
@@ -4203,7 +4210,12 @@ def revised_model():
             'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
             'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
             'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion',
-            'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed"
+            'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed",
+            'Main Wall Insulation',
+            'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof',
+            'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation',
+            'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV',
+            'Other measures', "2 of 3 Data Sources Have Consensus on EPC"
         ]
     ].rename(
         columns={
@@ -4271,7 +4283,7 @@ def revised_model():
     worksheet = worksheet.drop(columns=["Last EPC - uprn"])
 
     # Save to Excel with multiple sheets
-    excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "04022025 Stonewater Priority List.xlsx")
+    excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx")
     with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
         worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True)
         mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True)
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index dba85b3f..1b937b2d 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -258,18 +258,16 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight"
-    DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx"
-    SHEET_NAME = "Sheet1"
-    POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = None
-    ADDRESS1_COLUMN = "HouseName"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
+    DATA_FILENAME = "Stonewater  All Props for EPC Check 10.02.25.xlsx"
+    SHEET_NAME = "stonewater sap, insta"
+    POSTCODE_COLUMN = "Post Code"
+    FULLADDRESS_COLUMN = "Name"
+    ADDRESS1_COLUMN = "Name"
     ADDRESS1_METHOD = None
-    ADDRESS_COLS_TO_CONCAT = [
-        "HouseName", "Block", "Address1"
-    ]
+    ADDRESS_COLS_TO_CONCAT = []
     MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = 'Built In Year'
+    PROPERTY_YEAR_BUILT = None
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
@@ -633,7 +631,7 @@ def app():
 
         # We want to deduce if flats have 50% of the properties below C75
         # We group by postcode and property type
-        grouped = asset_list.groupby(["Postcode", "Property Type"])
+        grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"])
 
         flat_data = []
         for _, group in grouped:
@@ -643,7 +641,7 @@ def app():
 
                 flat_data.append(
                     {
-                        "Postcode": group["Postcode"].iloc[0],
+                        "Postcode": group[POSTCODE_COLUMN].iloc[0],
                         "Property Type": "Flat",
                         "Number of Flats with EPC": num_flats,
                         "Number of Flats below C75": num_below_c75,

From ebed7027ac721353593f089e015a9467ae6fa43e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 15 Feb 2025 22:33:01 +0000
Subject: [PATCH 30/72] adding minimums for the number of SAP points solar PV
 will deliver

---
 backend/Property.py                       |  4 +++-
 recommendations/Recommendations.py        |  7 +++++++
 recommendations/SolarPvRecommendations.py | 21 ++++++++++++++++++++-
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/backend/Property.py b/backend/Property.py
index e19970eb..eaffd54d 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -1259,7 +1259,9 @@ class Property:
         if (self.building_id is not None) and (self.solar_panel_configuration is not None):
             return True
 
-        # If the property is in a conservation area, don't recommend
+        # If the property is in a conservation area, is listed or is a heritage building, solar panels
+        # become a difficult measure to generally get through planning restrictions and so we do not recommend
+        # solar panels
         if self.restricted_measures:
             return False
 
diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py
index 42f4e783..715332a5 100644
--- a/recommendations/Recommendations.py
+++ b/recommendations/Recommendations.py
@@ -623,6 +623,13 @@ class Recommendations:
                     if li_sap_limit is not None:
                         property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit)
 
+                if rec["type"] == "solar_pv":
+                    # We use the SAP points in the recommendation as a minimum
+                    property_phase_impact["sap"] = (
+                        rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else
+                        property_phase_impact["sap"]
+                    )
+
                 # Insert this information into the recommendation.
                 if not rec.get("survey", False):
                     rec["sap_points"] = property_phase_impact["sap"]
diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py
index 95f189d3..a97dbcb3 100644
--- a/recommendations/SolarPvRecommendations.py
+++ b/recommendations/SolarPvRecommendations.py
@@ -14,11 +14,16 @@ class SolarPvRecommendations:
     # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group
     SOLAR_PANEL_WATTAGE = 400
 
+    # For domestic properties, we don't recommend a solar PV system with wattage outside of these
+    # bounds
     MAX_SYSTEM_WATTAGE = 6000
     MIN_SYSTEM_WATTAGE = 1000
 
+    # The maximum area of roof we allow to be covered in solar panels for our recommendations.
     MAX_ROOF_AREA_PERCENTAGE = 0.7
 
+    SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1
+
     def __init__(self, property_instance):
         """
         :param property_instance: Instance of the Property class, for the home associated to property_id
@@ -212,6 +217,20 @@ class SolarPvRecommendations:
             roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / roof_area * 100)
             # We round up to the nearest 5
             roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5
+
+            # Typically, we've observed that every 5% of additional roof coverage will result in at least
+            # an additional 1 SAP point (though often 2 points). Given this, we can add a reasonable minimum
+            # for the number of SAP points we might expect. We've observed that for some cases where properties
+            # are hitting the higher SAP scores (e.g. EPC A and above), the model can sometimes under-predict
+            # the number of SAP points. This appears to be due to a relatively small number of properties
+            # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a
+            # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels.
+            # Because panels are the final recommendation, they are often the measure that takes the home
+            # into the medium to high EPC A ranges and so because of a lack of training data, this means that
+            # we might sometimes under-predict. This minimum is intended to try and reduce the negative impact
+            # of this. This minimum is used in Recommendations.calculate_recommendation_impact
+            minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE
+
             for has_battery in [False, True]:
                 cost_result = self.costs.solar_pv(
                     has_battery=has_battery,
@@ -240,7 +259,7 @@ class SolarPvRecommendations:
                         "description": description,
                         "starting_u_value": None,
                         "new_u_value": None,
-                        "sap_points": None,
+                        "sap_points": minimum_sap_points,
                         "already_installed": already_installed,
                         **cost_result,
                         # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we

From 89d49690b5c9ca4efb89f3879bb7c414098e5ea2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 16 Feb 2025 17:02:51 +0000
Subject: [PATCH 31/72] added extraction of windows sap point

---
 etl/customers/remote_assessments/app.py   | 12 ++++++------
 recommendations/WindowsRecommendations.py | 12 ++++++++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index 70ceb76d..cce0f4fb 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 128
+PORTFOLIO_ID = 129
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,9 +19,9 @@ def app():
 
     asset_list = [
         {
-            "address": "46",
-            "postcode": "BS6 7BD",
-            "uprn": 61091,
+            "address": "19",
+            "postcode": "IP21 4YJ",
+            "uprn": 2630134524,
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -52,8 +52,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 61091,
-            "valuation": 897_000
+            "uprn": 2630134524,
+            "valuation": 96_000
         }
     ]
     # Store valuation data to s3
diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py
index 1f755369..46e56c93 100644
--- a/recommendations/WindowsRecommendations.py
+++ b/recommendations/WindowsRecommendations.py
@@ -215,21 +215,29 @@ class WindowsRecommendations:
             "glazed-type": glazed_type_ending,
         }
 
+        measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing"
+
+        non_invasive_recommendation = next(
+            (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]),
+            {}
+        )
+
         self.recommendation = [
             {
                 "phase": phase,
                 "parts": [],
                 "type": "windows_glazing",
-                "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing",
+                "measure_type": measure_type,
                 "description": description,
                 "starting_u_value": None,
                 "new_u_value": None,
-                "sap_points": None,
+                "sap_points": non_invasive_recommendation.get("sap_points", None),
                 "already_installed": already_installed,
                 **cost_result,
                 "is_secondary_glazing": is_secondary_glazing,
                 "description_simulation": description_simulation,
                 "simulation_config": simulation_config,
+                "survey": non_invasive_recommendation.get("survey", None),
             }
         ]
 

From c09b693922c8c3c8ac55648de2772312f319d487 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 16 Feb 2025 18:25:17 +0000
Subject: [PATCH 32/72] minor tweaks to engine during remote assessments

---
 backend/app/assumptions.py              |  1 +
 backend/app/plan/router.py              |  2 +-
 etl/customers/remote_assessments/app.py | 14 ++++++++------
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py
index 841ec2c1..8d0c05be 100644
--- a/backend/app/assumptions.py
+++ b/backend/app/assumptions.py
@@ -54,4 +54,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = {
     "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85},
     "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1},
     "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85},
+    "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85},
 }
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 949c8e4c..76c172ee 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -338,7 +338,7 @@ def extract_property_request_data(
 
     # Because we have some non-invasive recommendations that match on address and postcode, but not UPRN
     # we need to check existence of uprn
-    has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True
+    has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False
     if has_uprn:
         has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None]
 
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index cce0f4fb..ad97fd41 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 129
+PORTFOLIO_ID = 132
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,9 +19,11 @@ def app():
 
     asset_list = [
         {
-            "address": "19",
-            "postcode": "IP21 4YJ",
-            "uprn": 2630134524,
+            "address": "3",
+            "postcode": "BB8 0JF",
+            "uprn": 100010509503,
+            "property_type": "House",
+            "built_form": "End-Terrace",
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -52,8 +54,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 2630134524,
-            "valuation": 96_000
+            "uprn": 100010509503,
+            "valuation": 116_000
         }
     ]
     # Store valuation data to s3

From 764dc7901f2e7fc117a4df1053b7d9fe7eb9ad34 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 18 Feb 2025 12:20:04 +0000
Subject: [PATCH 33/72] setting up EPC data extraction process for creation of
 reports

---
 .idea/Model.iml                         |   2 +-
 .idea/misc.xml                          |   2 +-
 etl/customers/remote_assessments/app.py |  14 +--
 etl/route_march_data_pull/app.py        |  16 +--
 survey_report/app.py                    | 152 +++++++++++++++++++++---
 5 files changed, 151 insertions(+), 35 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index ad97fd41..15f59c5e 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 132
+PORTFOLIO_ID = 133
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,11 +19,9 @@ def app():
 
     asset_list = [
         {
-            "address": "3",
-            "postcode": "BB8 0JF",
-            "uprn": 100010509503,
-            "property_type": "House",
-            "built_form": "End-Terrace",
+            "address": "40",
+            "postcode": "PE4 5BB",
+            "uprn": 100090220519,
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -54,8 +52,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 100010509503,
-            "valuation": 116_000
+            "uprn": 100090220519,
+            "valuation": 135_000
         }
     ]
     # Store valuation data to s3
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 1b937b2d..f9cb7cbb 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -258,16 +258,16 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
-    DATA_FILENAME = "Stonewater  All Props for EPC Check 10.02.25.xlsx"
-    SHEET_NAME = "stonewater sap, insta"
-    POSTCODE_COLUMN = "Post Code"
-    FULLADDRESS_COLUMN = "Name"
-    ADDRESS1_COLUMN = "Name"
-    ADDRESS1_METHOD = None
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing"
+    DATA_FILENAME = "Community Housing PV data pull.xlsx"
+    SHEET_NAME = "Community Housing"
+    POSTCODE_COLUMN = "Postcode"
+    FULLADDRESS_COLUMN = "Full Address"
+    ADDRESS1_COLUMN = None
+    ADDRESS1_METHOD = "first_word"
     ADDRESS_COLS_TO_CONCAT = []
     MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = None
+    PROPERTY_YEAR_BUILT = "Build_Date"
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
diff --git a/survey_report/app.py b/survey_report/app.py
index be31bd52..774d2a15 100644
--- a/survey_report/app.py
+++ b/survey_report/app.py
@@ -1,4 +1,5 @@
 import os
+import requests
 import PyPDF2
 from string import Template
 
@@ -31,31 +32,135 @@ def generate_html_report(template_path, output_path, data):
     print(f"HTML report generated successfully: {output_path}")
 
 
+class PlacidApi:
+    # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors
+    ERROR_CODES = {
+        400: "Bad request",
+        401: "Unauthorized",
+        404: "Template Not found",
+        422: "Validation error",
+        429: "Rate limit exceeded",
+        500: "Internal server error",
+    }
+
+    def __init__(self, api_key):
+        self.api_key = api_key
+
+        self.headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+
+    def create_pdf(
+        self,
+        template_uuid: str,
+        current_epc_rating: str,
+        current_epc_rating_colour: str,
+        post_retrofit_epc_rating: str,
+        post_retrofit_epc_rating_colour: str,
+    ):
+        url = "https://api.placid.app/api/rest/pdfs"
+
+        body = {
+            "webhook_success": None,
+            "passthrough": None,
+            "pages": [
+                {
+                    "template_uuid": template_uuid,
+                    "layers": {
+                        "current_epc_rating": {
+                            "text": current_epc_rating,
+                            "text_color": current_epc_rating_colour,
+                        },
+                        "post_retrofit_epc_rating": {
+                            "text": post_retrofit_epc_rating,
+                            "text_color": post_retrofit_epc_rating_colour,
+                        }
+                    },
+                },
+            ]
+        }
+
+        response = requests.post(
+            url,
+            headers=self.headers,
+            json=body
+        )
+
+        response_body = response.json()
+        pdf_id = response_body["id"]
+
+    def get_pdf(self, pdf_id: str):
+        """
+        Poll the API every 5 seconds until the PDF is ready
+        """
+        url = f"https://api.placid.app/api/rest/pdfs/{pdf_id}"
+
+        response = requests.get(
+            url,
+            headers=self.headers
+        )
+        response_body = response.json()
+
+        url = response_body["pdf_url"]
+        # Download the PDF form this uurl
+        pdf_download = requests.get(url)
+        with open("output.pdf", "wb") as f:
+            f.write(pdf_download.content)
+
+
 def handle():
     """
     Performs the data extraction process for the survey report
     :return:
     """
 
+    PLACID_API_KEY = "placid-mpkwidzer2mens9h-hifa3dmbxpfeghpa"
+    TEMPLATE_UUID = "hnwqgtumckfbf"
+    placid_api = PlacidApi(PLACID_API_KEY)
+
+    EPC_COLOURS = {
+        "A": "#117d58",
+        "B": "#2da55c",
+        "C": "#8dbd40",
+        "D": "#f7cd14",
+        "E": "#f3a96a",
+        "F": "#ef8026",
+        "G": "#e41e3b",
+    }
+
     folders = [
-        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1",
-        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2",
-        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3",
-        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4",
-        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5",
+        {
+            "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 "
+                          "WILLIS ROAD FLAT 1 PRE EPR SITE NOTES.pdf",
+            "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 WILLIS "
+                   "ROAD FLAT 1 PRE EPR PDF.pdf",
+            "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 "
+                            "WILLIS ROAD FLAT 1 POST EPR PDF.pdf"
+        },
+        {
+            "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 "
+                          "WILLIS ROAD FLAT 2 PRE EPR SITE NOTES.pdf",
+            "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS "
+                   "ROAD FLAT 2 PRE EPR PDF.pdf",
+            "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 "
+                            "WILLIS ROAD FLAT 2 POST EPR PDF.pdf"
+        },
+        {
+            "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 "
+                          "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf",
+            "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS "
+                   "ROAD FLAT 3 PRE EPR PDF.pdf",
+            "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 "
+                            "WILLIS ROAD FLAT 3 POST EPR PDF.pdf"
+        },
     ]
     data = []
-    for data_folder in folders:
+    for data_config in folders:
 
-        folder_contents = os.listdir(data_folder)
-        # We look for the following files:
-        # Site notes
         file_mapping = {}
-        for file in folder_contents:
-            # Check if it's a pdf file
-            if not file.endswith(".pdf"):
-                continue
-            filepath = os.path.join(data_folder, file)
+        for filename, filepath in data_config.items():
             with (open(filepath, "rb") as f):
                 pdf = PyPDF2.PdfReader(f)
                 first_page = pdf.pages[0].extract_text()
@@ -66,16 +171,27 @@ def handle():
             # Check the report type
             report_type = detect_report_type(first_page)
             if report_type is not None:
-                file_mapping[report_type] = text
+                file_mapping[filename] = text
 
         # This is only set up to work with quido site notes so we must have it
-        site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
+        site_notes_extractor = SiteNotesExtractor(file_mapping["site_notes"])
         site_notes = site_notes_extractor.extract_all()
 
         # We also must have an EPR
-        epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
+        epr_extractor = EPRExtractor(file_mapping["epr"])
         epr = epr_extractor.extract_all()
 
+        scenario_epr = EPRExtractor(file_mapping["scenario_epr"])
+        scenario_epr = scenario_epr.extract_all()
+
+        report_data = {
+            "template_uuid": TEMPLATE_UUID,
+            "current_epc_rating": site_notes["Current EPC Band"],
+            "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]],
+            post_retrofit_epc_rating: str,
+            post_retrofit_epc_rating_colour: str,
+        }
+
         # We now produce the combined data sheet which is the starting figure:
         data_sheet = {**epr, **site_notes}
         del data_sheet['Building Dimensions']
@@ -83,7 +199,9 @@ def handle():
         data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
         data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
         del data_sheet["Total Building Dimensions"]
+
         data.append(data_sheet)
+
     data = pd.DataFrame(data)
 
     # Generate the HTML report

From 0de14c4e286b05ecd881aa05f81f1f6172472589 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 18 Feb 2025 19:49:29 +0000
Subject: [PATCH 34/72] quidos site notes extraction

---
 backend/ml_models/Valuation.py     | 26 ++++++++-
 etl/route_march_data_pull/app.py   | 69 ++++++++++++++++++----
 survey_report/app.py               | 92 ++++++++++++++++++++++-------
 survey_report/extraction/quidos.py | 94 +++++++++++++++++++++++++++++-
 4 files changed, 243 insertions(+), 38 deletions(-)

diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index 720005d3..6d4852b2 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -1,5 +1,4 @@
 import numpy as np
-from scipy.constants import value
 
 
 class PropertyValuation:
@@ -216,6 +215,30 @@ class PropertyValuation:
             cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn)
         )
 
+        current_epc = property_instance.data["current-energy-rating"]
+
+        if not current_value:
+            return {
+                "current_value": 0,
+                "lower_bound_increased_value": 0,
+                "upper_bound_increased_value": 0,
+                "average_increased_value": 0,
+                "average_increase": 0
+            }
+
+        return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
+
+    @classmethod
+    def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None):
+        """
+        This function estimates the value of a property based on the current EPC rating and the target EPC rating
+        :param current_value:
+        :param current_epc:
+        :param target_epc:
+        :param total_cost:
+        :return:
+        """
+
         if not current_value:
             return {
                 "current_value": 0,
@@ -225,7 +248,6 @@ class PropertyValuation:
                 "average_increase": 0
             }
 
-        current_epc = property_instance.data["current-energy-rating"]
         # We get the spectrum of ratings between the current and target EPC
         epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1]
 
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index f9cb7cbb..ee6a46d3 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -24,21 +24,24 @@ load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
-def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=False):
+def get_data(
+    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None,
+    epc_api_only=False
+):
     epc_data = []
     errors = []
     no_epc = []
     for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
         try:
             postcode = home[postcode_column]
-            house_number = home[address1_column].strip()
+            house_number = str(home[address1_column]).strip()
             full_address = home[fulladdress_column].strip()
             house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
             if house_no is None:
                 house_no = house_number
             uprn = manual_uprn_map.get(full_address, None)
-            if uprn is None and home.get("uprn"):
-                uprn = home["uprn"]
+            if uprn is None and home.get(uprn_column):
+                uprn = home[uprn_column]
 
             if pd.isnull(uprn):
                 uprn = None
@@ -149,7 +152,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m
     return epc_data, errors, no_epc
 
 
-def extract_address1(asset_list, full_address_col, method="first_two_words"):
+def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
     if method == "first_two_words":
         asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
         return asset_list
@@ -158,6 +161,13 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"):
         asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
         return asset_list
 
+    if method == "house_number_extraction":
+        asset_list["address1_extracted"] = asset_list.apply(
+            lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
+            axis=1
+        )
+        return asset_list
+
     raise ValueError(f"Method {method} not recognized")
 
 
@@ -258,16 +268,29 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing"
-    DATA_FILENAME = "Community Housing PV data pull.xlsx"
-    SHEET_NAME = "Community Housing"
-    POSTCODE_COLUMN = "Postcode"
+    # For Westward
+    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    # DATA_FILENAME = "WESTWARD - completed list..xlsx"
+    # SHEET_NAME = "Sheet1"
+    # POSTCODE_COLUMN = "WFT EDIT Postcode"
+    # FULLADDRESS_COLUMN = "Address"
+    # ADDRESS1_COLUMN = None
+    # ADDRESS1_METHOD = "house_number_extraction"
+    # ADDRESS_COLS_TO_CONCAT = []
+    # MISSING_POSTCODES_METHOD = None
+    # PROPERTY_YEAR_BUILT = "Build date"
+    # UPRN_COLUMN = "UPRN"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    SHEET_NAME = "Sheet1"
+    POSTCODE_COLUMN = 'Full Address.1'
     FULLADDRESS_COLUMN = "Full Address"
     ADDRESS1_COLUMN = None
     ADDRESS1_METHOD = "first_word"
     ADDRESS_COLS_TO_CONCAT = []
     MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = "Build_Date"
+    PROPERTY_YEAR_BUILT = "Build Date"
+    UPRN_COLUMN = None
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
@@ -299,7 +322,10 @@ def app():
     if ADDRESS1_COLUMN is None:
         ADDRESS1_COLUMN = "address1_extracted"
         asset_list = extract_address1(
-            asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
+            asset_list=asset_list,
+            full_address_col=FULLADDRESS_COLUMN,
+            postcode_col=POSTCODE_COLUMN,
+            method=ADDRESS1_METHOD
         )
 
     if FULLADDRESS_COLUMN is None:
@@ -315,6 +341,23 @@ def app():
         asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False)
         asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)
 
+    if UPRN_COLUMN is not None:
+        # Check if it's numeric and if so, make sure it's an integer
+        def convert_uprn(x):
+
+            if pd.isnull(x):
+                return x
+
+            # check if numeric
+            if np.isreal(x):
+                return str(int(x))
+
+            if str(x).isdigit():
+                return str(int(x))
+            return x
+
+        asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn)
+
     # We check for duplicated addresses
     asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
     if asset_list["deduper"].duplicated().sum():
@@ -342,7 +385,8 @@ def app():
             fulladdress_column=FULLADDRESS_COLUMN,
             address1_column=ADDRESS1_COLUMN,
             postcode_column=POSTCODE_COLUMN,
-            manual_uprn_map=MANUAL_UPRN_MAP
+            manual_uprn_map=MANUAL_UPRN_MAP,
+            uprn_column=UPRN_COLUMN
         )
 
         # We now retrieve any failed properties
@@ -535,6 +579,7 @@ def app():
     # 3) If we have year in the asset list, we flag entries where the built year is different from the
     # EPC Age band
     if PROPERTY_YEAR_BUILT is not None:
+        raise Exception("THIS WAS WRONG!")
         asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
             lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
         )
diff --git a/survey_report/app.py b/survey_report/app.py
index 774d2a15..f6eddb8d 100644
--- a/survey_report/app.py
+++ b/survey_report/app.py
@@ -32,6 +32,15 @@ def generate_html_report(template_path, output_path, data):
     print(f"HTML report generated successfully: {output_path}")
 
 
+def stringify_number(num: int, rounding: bool = True) -> str:
+    if num < 100000:  # 5 figures or fewer
+        rounded_num = ((num + 99) // 100) * 100 if rounding else num
+        return f"{rounded_num:,}"
+    else:  # More than 5 figures
+        rounded_num = ((num + 999) // 1000) * 1000 if rounding else num
+        return f"{rounded_num // 1000}k"
+
+
 class PlacidApi:
     # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors
     ERROR_CODES = {
@@ -89,7 +98,8 @@ class PlacidApi:
         )
 
         response_body = response.json()
-        pdf_id = response_body["id"]
+
+        return response_body
 
     def get_pdf(self, pdf_id: str):
         """
@@ -106,20 +116,22 @@ class PlacidApi:
         url = response_body["pdf_url"]
         # Download the PDF form this uurl
         pdf_download = requests.get(url)
-        with open("output.pdf", "wb") as f:
+        with open("survey_report/example_data/output.pdf", "wb") as f:
             f.write(pdf_download.content)
 
 
-def handle():
+def handler():
     """
     Performs the data extraction process for the survey report
     :return:
     """
 
     PLACID_API_KEY = "placid-mpkwidzer2mens9h-hifa3dmbxpfeghpa"
-    TEMPLATE_UUID = "hnwqgtumckfbf"
+    TEMPLATE_UUID = "5bst9mh1q9lk9"
     placid_api = PlacidApi(PLACID_API_KEY)
 
+    current_property_value = 250000  # Needs to be an input
+
     EPC_COLOURS = {
         "A": "#117d58",
         "B": "#2da55c",
@@ -136,26 +148,27 @@ def handle():
                           "WILLIS ROAD FLAT 1 PRE EPR SITE NOTES.pdf",
             "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 WILLIS "
                    "ROAD FLAT 1 PRE EPR PDF.pdf",
-            "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1/3 "
-                            "WILLIS ROAD FLAT 1 POST EPR PDF.pdf"
+            "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data"
+                                   "/Flat 1/3 WILLIS ROAD FLAT 1 POST EPR SITE NOTES.pdf"
         },
         {
             "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 "
                           "WILLIS ROAD FLAT 2 PRE EPR SITE NOTES.pdf",
             "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS "
                    "ROAD FLAT 2 PRE EPR PDF.pdf",
-            "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 "
-                            "WILLIS ROAD FLAT 2 POST EPR PDF.pdf"
+            "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data"
+                                   "/Flat 2/3 WILLIS ROAD FLAT 2 POST EPR SITE NOTES.pdf"
         },
         {
             "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 "
                           "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf",
             "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS "
                    "ROAD FLAT 3 PRE EPR PDF.pdf",
-            "scenario_epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 "
-                            "WILLIS ROAD FLAT 3 POST EPR PDF.pdf"
+            "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data"
+                                   "/Flat 3/3 WILLIS ROAD FLAT 3 POST EPR SITE NOTES.pdf"
         },
     ]
+
     data = []
     for data_config in folders:
 
@@ -181,26 +194,61 @@ def handle():
         epr_extractor = EPRExtractor(file_mapping["epr"])
         epr = epr_extractor.extract_all()
 
-        scenario_epr = EPRExtractor(file_mapping["scenario_epr"])
-        scenario_epr = scenario_epr.extract_all()
+        # Valuation simulation
+        scenario_site_notes_extractor = SiteNotesExtractor(file_mapping["scenario_site_notes"])
+        scenario_site_notes = scenario_site_notes_extractor.extract_all()
+
+        from backend.ml_models.Valuation import PropertyValuation
+        valuation_uplift = PropertyValuation.estimate_valuation_improvement(
+            current_value=current_property_value,
+            current_epc=site_notes["Current EPC Band"],
+            target_epc=scenario_site_notes["Current EPC Band"],
+        )
+        # TODO - should convert this, when it's more than 5 figures and we should certainly stringify this
+
+        valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value)
+
+        # Prepare the data for output
+        bill_savings = round(
+            site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)']
+        )
+
+        carbon_savings = round(
+            site_notes["Current Carbon Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"],
+            2
+        )
+
+        payback_period = None
+        if payback_period is None:
+            raise NotImplementedError("Implement me")
+
+        # We extract the measures from the site notes
 
         report_data = {
-            "template_uuid": TEMPLATE_UUID,
             "current_epc_rating": site_notes["Current EPC Band"],
             "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]],
-            post_retrofit_epc_rating: str,
-            post_retrofit_epc_rating_colour: str,
+            "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"],
+            "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]],
+            "bill_savings": stringify_number(bill_savings),
+            "valuation_improvement": stringify_number(valuation_difference),
+            "carbon_savings": carbon_savings,
+
         }
 
         # We now produce the combined data sheet which is the starting figure:
-        data_sheet = {**epr, **site_notes}
-        del data_sheet['Building Dimensions']
-        # We unnest the Total Building Dimensions
-        data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
-        data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
-        del data_sheet["Total Building Dimensions"]
+        # data_sheet = {**epr, **site_notes}
+        # del data_sheet['Building Dimensions']
+        # # We unnest the Total Building Dimensions
+        # data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
+        # data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
+        # del data_sheet["Total Building Dimensions"]
 
-        data.append(data_sheet)
+        create_pdf_response = placid_api.create_pdf(
+            template_uuid=TEMPLATE_UUID, **report_data
+        )
+        # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None}
+        # Download locally
+        placid_api.get_pdf(create_pdf_response["id"])
 
     data = pd.DataFrame(data)
 
diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py
index 374df084..2e772886 100644
--- a/survey_report/extraction/quidos.py
+++ b/survey_report/extraction/quidos.py
@@ -108,8 +108,98 @@ class SiteNotesExtractor:
         self.extract_carbon_emissions()
         self.extract_bills_estimate()
         self.extract_building_dimensions()
+
+        # Extract specific measures
+        # Primary wall
+        # Secondary wall
+        # Roof
+        # Floor
+        # Heating system
+        # Hot water system
+        # Windows
+        # Doors
+        # Lighting
+        # Ventilation
+        # Solar
+
         return self.data
 
+    def extract_walls(self):
+        """
+        Extracts wall type, insulation, dry-lining, and thickness for each building part,
+        including any alternative wall details within the 7.0 Walls section of the summary PDF text.
+        """
+
+        text = self.text
+        wall_data = []
+
+        # Isolate the 7.0 Walls section
+        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", text, re.DOTALL)
+        if not wall_section_match:
+            raise ValueError("Failed to locate the walls section in the text.")
+
+        wall_section = wall_section_match.group(1)
+
+        # Define patterns to match walls for each building part
+        wall_pattern = re.compile(
+            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
+            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
+            r"(?:Insulation\s*(?P<insulation>[^\n]*)\n)?"
+            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
+            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
+            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
+            re.MULTILINE
+        )
+
+        # TODO: We aren't effectively picking up alternative walls
+        # alt_wall_pattern = re.compile(
+        #     r"Alternative Wall Sheltered\s*.*?\n"
+        #     r".*?Construction\s*(?P<alt_construction>[^\n]*)\n"
+        #     r"Insulation\s*(?P<alt_insulation>[^\n]*)\n"
+        #     r"Insulation Thickness\(mm\)\s*(?P<alt_insulation_thickness>[^\n]*)\n"
+        #     r"Wall Thickness Measured\?\s*(?P<alt_thickness_measured>[^\n]*)\n"
+        #     r"Wall Thickness\(mm\)\s*(?P<alt_thickness>\d+)?",
+        #     re.MULTILINE
+        # )
+
+        for match in wall_pattern.finditer(wall_section):
+            building_part = match.group("section")
+            # has_alternative_wall = "Alternative" in building_part
+            building_part = "Main Property" if "Main Property" in building_part else building_part
+
+            wall_entry = {
+                "Building Part": building_part,
+                "Wall Type": match.group("construction") or "Unknown",
+                "Wall Insulation": match.group("insulation") or "Unknown",
+                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
+                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
+                "Wall Thickness (mm)": int(match.group("thickness")) if match.group("thickness") and match.group(
+                    "thickness").isdigit() else None,
+                "Alternative Wall Type": None,
+                "Alternative Wall Insulation": None,
+                "Alternative Insulation Thickness (mm)": None,
+                "Alternative Wall Thickness Measured": None,
+                "Alternative Wall Thickness (mm)": None,
+            }
+
+            # Check if an alternative wall section exists
+            # if has_alternative_wall:
+            #     alt_match = alt_wall_pattern.search(wall_section, match.end())
+            #     if alt_match:
+            #         wall_entry["Alternative Wall Type"] = alt_match.group("alt_construction") or "Unknown"
+            #         wall_entry["Alternative Wall Insulation"] = alt_match.group("alt_insulation") or "Unknown"
+            #         wall_entry["Alternative Insulation Thickness (mm)"] = alt_match.group(
+            #             "alt_insulation_thickness") or "Unknown"
+            #         wall_entry["Alternative Wall Thickness Measured"] = alt_match.group(
+            #             "alt_thickness_measured") or "Unknown"
+            #         wall_entry["Alternative Wall Thickness (mm)"] = int(
+            #             alt_match.group("alt_thickness")) if alt_match.group("alt_thickness") and alt_match.group(
+            #             "alt_thickness").isdigit() else None
+
+            wall_data.append(wall_entry)
+
+        return wall_data
+
 
 class EPRExtractor:
     """
@@ -123,7 +213,7 @@ class EPRExtractor:
         self.text = pdf_text
         self.data = {}
 
-    def extract_heating_data(self):
+    def extract_heating_consumption(self):
         """
         Extracts space heating and water heating values from the report.
         """
@@ -162,5 +252,5 @@ class EPRExtractor:
         Runs all extraction methods and returns a dictionary with extracted data.
         """
         self.extract_address()
-        self.extract_heating_data()
+        self.extract_heating_consumption()
         return self.data

From 55d2df17877d184b3bd9874a6da47cab6d3e6450 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 10:12:22 +0000
Subject: [PATCH 35/72] debugging epc searcher

---
 backend/SearchEpc.py             |  3 +
 etl/route_march_data_pull/app.py | 95 +++++++++++++++++++++++++-------
 2 files changed, 77 insertions(+), 21 deletions(-)

diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index c74a0b1f..e8a9dfaa 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -331,6 +331,9 @@ class SearchEpc:
             if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
         ]
 
+        if data:
+            api_response["msg"] = self.SUCCESS
+
         return api_response["msg"]
 
     def filter_rows(self, rows, property_type=None, address=None):
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index ee6a46d3..57239989 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -4,6 +4,7 @@ from BaseUtility import Definitions
 import pandas as pd
 import numpy as np
 from tqdm import tqdm
+from datetime import datetime
 
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
@@ -172,7 +173,10 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t
 
 
 def process_age_band(x, year_built_column):
-    year_built = float(x[year_built_column])
+    if isinstance(x[year_built_column], datetime):
+        year_built = x[year_built_column].year
+    else:
+        year_built = float(x[year_built_column])
 
     if pd.isnull(x["Property Age Band"]) or (
         x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES
@@ -195,6 +199,12 @@ def process_age_band(x, year_built_column):
         if year_built < 2007:
             return "EPC Age Band is older than Year Built"
 
+    if x["Property Age Band"] == "England and Wales: 2012 onwards":
+        if year_built >= 2012:
+            return "EPC Age Band Matches Year Built"
+        if year_built < 2012:
+            return "EPC Age Band is older than Year Built"
+
     if x["Property Age Band"] == "England and Wales: before 1900":
         if year_built < 1900:
             return "EPC Age Band Matches Year Built"
@@ -206,7 +216,7 @@ def process_age_band(x, year_built_column):
     # so we extract the lower and upper date
     age_band = x["Property Age Band"].split(": ")[1]
     lower_date, upper_date = age_band.split("-")
-    if year_built <= float(upper_date) and year_built <= float(upper_date):
+    if year_built <= float(upper_date) and year_built >= float(lower_date):
         return "EPC Age Band Matches Year Built"
 
     if year_built > float(upper_date):
@@ -269,28 +279,33 @@ def app():
     # - Or the insulation required is loft/cavity (floors should be solid)
 
     # For Westward
-    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
-    # DATA_FILENAME = "WESTWARD - completed list..xlsx"
-    # SHEET_NAME = "Sheet1"
-    # POSTCODE_COLUMN = "WFT EDIT Postcode"
-    # FULLADDRESS_COLUMN = "Address"
-    # ADDRESS1_COLUMN = None
-    # ADDRESS1_METHOD = "house_number_extraction"
-    # ADDRESS_COLS_TO_CONCAT = []
-    # MISSING_POSTCODES_METHOD = None
-    # PROPERTY_YEAR_BUILT = "Build date"
-    # UPRN_COLUMN = "UPRN"
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
-    DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    DATA_FILENAME = "WESTWARD - completed list..xlsx"
     SHEET_NAME = "Sheet1"
-    POSTCODE_COLUMN = 'Full Address.1'
-    FULLADDRESS_COLUMN = "Full Address"
+    POSTCODE_COLUMN = "WFT EDIT Postcode"
+    FULLADDRESS_COLUMN = "Address"
     ADDRESS1_COLUMN = None
-    ADDRESS1_METHOD = "first_word"
+    ADDRESS1_METHOD = "house_number_extraction"
     ADDRESS_COLS_TO_CONCAT = []
     MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = "Build Date"
-    UPRN_COLUMN = None
+    PROPERTY_YEAR_BUILT = "Build date"
+    UPRN_COLUMN = "UPRN"
+    # If we have the non-intrusives data, this should be true
+    HAS_NON_INTRUSIVES = True
+
+    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    # SHEET_NAME = "Sheet1"
+    # POSTCODE_COLUMN = 'Full Address.1'
+    # FULLADDRESS_COLUMN = "Full Address"
+    # ADDRESS1_COLUMN = None
+    # ADDRESS1_METHOD = "first_word"
+    # ADDRESS_COLS_TO_CONCAT = []
+    # MISSING_POSTCODES_METHOD = None
+    # PROPERTY_YEAR_BUILT = "Build Date"
+    # UPRN_COLUMN = None
+    # # If we have the non-intrusives data, this should be true
+    # HAS_NON_INTRUSIVES = True
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
@@ -358,6 +373,20 @@ def app():
 
         asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn)
 
+    # We attempt to process the year built column
+    if PROPERTY_YEAR_BUILT is not None:
+        # We check if we have a datetime
+        if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime):
+            # We treat any string columns - with common values we see
+            datetime_remap = {
+                "Pre 1900": datetime(year=1899, month=12, day=31),
+            }
+            asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap)
+
+            asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT])
+            # Convert this to year
+            asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year
+
     # We check for duplicated addresses
     asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
     if asset_list["deduper"].duplicated().sum():
@@ -579,11 +608,35 @@ def app():
     # 3) If we have year in the asset list, we flag entries where the built year is different from the
     # EPC Age band
     if PROPERTY_YEAR_BUILT is not None:
-        raise Exception("THIS WAS WRONG!")
         asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
             lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
         )
 
+    if HAS_NON_INTRUSIVES:
+        # Empty cavity:
+        # 1) Has been flagged on the non-intrusives as being empty or partially filled
+        # 2) The age is before 1995
+        # 3) Remove anything that likely has access issues
+        asset_list["Suitable for Cavity Fill"] = (
+            (asset_list["Construction"] == "CAVITY") &
+            asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
+            (
+                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995
+            )
+        )
+
+        # asset_list["Suitable for Extraction"] =
+        asset_list[
+            (asset_list["Construction"] == "Cavity") &
+            asset_list["Insulated"].isin(["RETRO DRILLED"]) &
+            (
+                (asset_list[PROPERTY_YEAR_BUILT] <= 1995)
+            ) &
+            (
+                asset_list[]
+            )
+        ]
+
     # 4) Flag properties that look like they're good candidates for solar installs
     # Firstly, flag if the fabric is completely done
 

From 8432b7d202c24962bae64b04023600de13a6a03d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 11:50:28 +0000
Subject: [PATCH 36/72] creating the asset list class

---
 asset_list/AssetList.py          |  64 ++++++++++++
 etl/route_march_data_pull/app.py | 166 +++++++++++++++++++++----------
 2 files changed, 180 insertions(+), 50 deletions(-)
 create mode 100644 asset_list/AssetList.py

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
new file mode 100644
index 00000000..2a16e82f
--- /dev/null
+++ b/asset_list/AssetList.py
@@ -0,0 +1,64 @@
+import os
+import pandas as pd
+
+
+class AssetList:
+    """
+    This class is used to standardise asset lists so that we can process the core information in a consistent manner.
+    """
+
+    # These are the accepted methods we have for cleaning the address1 column
+    ADDRESS_1_CLEANING_METHODS = [
+        "first_two_words",  # This method will split on the fist two words, where the separator is a space
+        "first_word",  # This method will split on the first word, where the separator is a space
+        "house_number_extraction",  # This method will use the NLP model in SearchEPC to extract the housenumber
+        "address1_extraction"  # This method will use the NLP model to extract address1
+    ]
+
+    def __init__(
+        self,
+        local_filepath,
+        sheet_name,
+        address1_colname,
+        postcode_colname,
+        full_address_colname,
+        full_address_cols_to_concat=None,
+        missing_postcodes_method=None,
+        landlord_year_built=None,
+        landlord_uprn=None,
+        header=0
+    ):
+        self.local_filepath = local_filepath
+        self.sheet_name = sheet_name
+        self.standardised_asset_list = None
+        # Read in the data
+        self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
+
+        # We detect the presence of the non-intrusive columns
+        self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
+
+        # Names of columns
+        self.address1_colname = address1_colname
+        self.postcode_colname = postcode_colname
+        self.full_address_colname = full_address_colname
+        self.landlord_year_built = landlord_year_built
+        self.landlord_uprn = landlord_uprn
+
+        # parameters for cleaning
+        self.full_address_cols_to_concat = full_address_cols_to_concat
+        self.missing_postcodes_method = missing_postcodes_method
+
+    def standardise(self):
+        """
+        This function is used to standardise the asset list
+        :return: standardised asset list
+        """
+
+        # We keep just the columns we care about and will work through the various columns and standardise
+        self.standardised_asset_list = self.raw_asset_list[
+            [
+
+            ]
+        ]
+
+        raise NotImplementedError
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 57239989..06082774 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -5,6 +5,7 @@ import pandas as pd
 import numpy as np
 from tqdm import tqdm
 from datetime import datetime
+from asset_list.AssetList import AssetList
 
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
@@ -172,60 +173,107 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t
     raise ValueError(f"Method {method} not recognized")
 
 
-def process_age_band(x, year_built_column):
-    if isinstance(x[year_built_column], datetime):
-        year_built = x[year_built_column].year
-    else:
-        year_built = float(x[year_built_column])
+def process_age_band(asset_list, year_built_column):
+    processed_age_band = []
+    for _, x in asset_list.iterrows():
 
-    if pd.isnull(x["Property Age Band"]) or (
-        x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES
-    ) or pd.isnull(year_built):
-        return "No EPC Age Band"
+        if pd.isnull(x["Property Age Band"]) or (
+            x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES
+        ):
+            processed_age_band.append({
+                "row_id": x["row_id"],
+                "epc_year_lower_bound": None,
+                "epc_year_upper_bound": None,
+                "Does Age Match EPC Age Band?": "No EPC Age Band"
+            })
+            continue
 
-    # We check if we have a numeric data
-    if x["Property Age Band"].isdigit():
-        if year_built == float(x["Property Age Band"]):
-            return "EPC Age Band Matches Year Built"
-        if year_built > float(x["Property Age Band"]):
-            return "EPC Age Band is older than Year Built"
-        if year_built < float(x["Property Age Band"]):
-            return "EPC Age Band is newer than Year Built"
+        # We extract the upper and lower bounds
+        if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]:
+            year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012
 
-    # Handle specific case
-    if x["Property Age Band"] == "England and Wales: 2007 onwards":
-        if year_built >= 2007:
-            return "EPC Age Band Matches Year Built"
-        if year_built < 2007:
-            return "EPC Age Band is older than Year Built"
+            if pd.isnull(x[year_built_column]):
+                age_band_matches = "No Year Built From Landlord"
+            else:
+                age_band_matches = (
+                    "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound
+                    else "EPC Age Band is older than Year Built"
+                )
 
-    if x["Property Age Band"] == "England and Wales: 2012 onwards":
-        if year_built >= 2012:
-            return "EPC Age Band Matches Year Built"
-        if year_built < 2012:
-            return "EPC Age Band is older than Year Built"
+            processed_age_band.append(
+                {
+                    "row_id": x["row_id"],
+                    "epc_year_lower_bound": year_lower_bound,
+                    "epc_year_upper_bound": None,
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+            continue
 
-    if x["Property Age Band"] == "England and Wales: before 1900":
-        if year_built < 1900:
-            return "EPC Age Band Matches Year Built"
-        if year_built >= 1900:
-            return "EPC Age Band is newer than Year Built"
+        if x["Property Age Band"] == "England and Wales: before 1900":
 
-    # Age band will be formatted as such:
-    # 'England and Wales: {upper date}-{lower date}'
-    # so we extract the lower and upper date
-    age_band = x["Property Age Band"].split(": ")[1]
-    lower_date, upper_date = age_band.split("-")
-    if year_built <= float(upper_date) and year_built >= float(lower_date):
-        return "EPC Age Band Matches Year Built"
+            if pd.isnull(x[year_built_column]):
+                age_band_matches = "No Year Built From Landlord"
+            else:
+                age_band_matches = (
+                    "EPC Age Band Matches Year Built" if x[year_built_column] < 1900
+                    else "EPC Age Band is newer than Year Built"
+                )
 
-    if year_built > float(upper_date):
-        return "EPC Age Band is older than Year Built"
+            processed_age_band.append(
+                {
+                    "row_id": x["row_id"],
+                    "epc_year_lower_bound": None,
+                    "epc_year_upper_bound": 1899,
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+            continue
 
-    if year_built < float(upper_date):
-        return "EPC Age Band is newer than Year Built"
+        if x["Property Age Band"].isdigit():
 
-    raise Exception("Should not reach here")
+            if pd.isnull(x[year_built_column]):
+                age_band_matches = "No Year Built From Landlord"
+            else:
+                age_band_matches = (
+                    "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"])
+                    else "EPC Age Band is different from Year Built"
+                )
+
+            processed_age_band.append(
+                {
+                    "row_id": x["row_id"],
+                    "epc_year_lower_bound": int(x["Property Age Band"]),
+                    "epc_year_upper_bound": int(x["Property Age Band"]),
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+            continue
+
+        # Otherwise, we extract the upper and lower bounds
+        age_band = x["Property Age Band"].split(": ")[1]
+        lower_date, upper_date = age_band.split("-")
+
+        age_band_matches = (
+            "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and (
+                x[year_built_column] <= float(upper_date)
+            )
+            else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date)
+            else "EPC Age Band is newer than Year Built"
+        )
+
+        processed_age_band.append(
+            {
+                "row_id": x["row_id"],
+                "epc_year_lower_bound": int(lower_date),
+                "epc_year_upper_bound": int(upper_date),
+                "Does Age Match EPC Age Band?": age_band_matches
+            }
+        )
+
+    processed_age_band = pd.DataFrame(processed_age_band)
+
+    return processed_age_band
 
 
 def app():
@@ -282,16 +330,27 @@ def app():
     DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
     DATA_FILENAME = "WESTWARD - completed list..xlsx"
     SHEET_NAME = "Sheet1"
+
     POSTCODE_COLUMN = "WFT EDIT Postcode"
     FULLADDRESS_COLUMN = "Address"
     ADDRESS1_COLUMN = None
     ADDRESS1_METHOD = "house_number_extraction"
+
     ADDRESS_COLS_TO_CONCAT = []
     MISSING_POSTCODES_METHOD = None
     PROPERTY_YEAR_BUILT = "Build date"
     UPRN_COLUMN = "UPRN"
     # If we have the non-intrusives data, this should be true
     HAS_NON_INTRUSIVES = True
+    PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits
+
+    invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
+
+    asset_list = AssetList(
+        local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
+        header=0,
+        sheet_name=SHEET_NAME
+    )
 
     # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
     # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
@@ -608,8 +667,10 @@ def app():
     # 3) If we have year in the asset list, we flag entries where the built year is different from the
     # EPC Age band
     if PROPERTY_YEAR_BUILT is not None:
-        asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
-            lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
+        # We process the age band and merge it on
+        processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT)
+        asset_list = asset_list.merge(
+            processed_age_band, how="left", on="row_id"
         )
 
     if HAS_NON_INTRUSIVES:
@@ -621,7 +682,12 @@ def app():
             (asset_list["Construction"] == "CAVITY") &
             asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
             (
-                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995
+                # Should we defer to the year built provided by the HA?
+                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995)
+            ) &
+            (
+                # We check if the property type column contains one of the invalid property types
+                ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary))
             )
         )
 
@@ -633,9 +699,9 @@ def app():
                 (asset_list[PROPERTY_YEAR_BUILT] <= 1995)
             ) &
             (
-                asset_list[]
+                asset_list[PROPERTY_TYPE_COLUMN]
             )
-        ]
+            ]
 
     # 4) Flag properties that look like they're good candidates for solar installs
     # Firstly, flag if the fabric is completely done

From 7e9347e530cc52fe38ceef66163447d6fd556b5e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 12:53:09 +0000
Subject: [PATCH 37/72] setting up libpostal

---
 .idea/Model.iml                          |   2 +-
 .idea/misc.xml                           |   2 +-
 asset_list/AssetList.py                  |  71 +++++++++-
 asset_list/README.md                     | 172 +++++++++++++++++++++++
 asset_list/requirements.txt              |   3 +
 asset_list/tests/test_standardisation.py |   9 ++
 etl/route_march_data_pull/app.py         |  18 ++-
 7 files changed, 272 insertions(+), 5 deletions(-)
 create mode 100644 asset_list/README.md
 create mode 100644 asset_list/requirements.txt
 create mode 100644 asset_list/tests/test_standardisation.py

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..96ad7a95 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..fb10c6b0 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 2a16e82f..35da9c3b 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1,5 +1,10 @@
 import os
+import usaddress
 import pandas as pd
+from utils.logger import setup_logger
+from backend.SearchEpc import SearchEpc
+
+logger = setup_logger()
 
 
 class AssetList:
@@ -15,6 +20,15 @@ class AssetList:
         "address1_extraction"  # This method will use the NLP model to extract address1
     ]
 
+    STANDARD_PROPERTY_TYPES = [
+        "house",
+        "flat",
+        "bungalow",
+        "maisonette",
+        "park home",
+        "block house",
+    ]
+
     def __init__(
         self,
         local_filepath,
@@ -26,6 +40,10 @@ class AssetList:
         missing_postcodes_method=None,
         landlord_year_built=None,
         landlord_uprn=None,
+        landlord_property_type=None,
+        landlord_wall_construction=None,
+        landlord_heating_system=None,
+        landlord_existing_pv=None,
         header=0
     ):
         self.local_filepath = local_filepath
@@ -43,21 +61,72 @@ class AssetList:
         self.full_address_colname = full_address_colname
         self.landlord_year_built = landlord_year_built
         self.landlord_uprn = landlord_uprn
+        self.landlord_property_type = landlord_property_type
+        self.landlord_wall_construction = landlord_wall_construction
+        self.landlord_heating_system = landlord_heating_system
+        self.landlord_existing_pv = landlord_existing_pv
 
         # parameters for cleaning
         self.full_address_cols_to_concat = full_address_cols_to_concat
         self.missing_postcodes_method = missing_postcodes_method
 
+        self.debug_information = {
+            "property_type": None,
+            "wall_construction": None,
+            "heating_system": None,
+            "existing_pv": None
+        }
+
+    @classmethod
+    def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"):
+
+        if method not in cls.ADDRESS_1_CLEANING_METHODS:
+            raise ValueError(f"Method {method} for producing address1 not recognized")
+
+        if method == "first_two_words":
+            asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+            return asset_list
+
+        if method == "first_word":
+            asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
+            return asset_list
+
+        if method == "house_number_extraction":
+            asset_list["address1_extracted"] = asset_list.apply(
+                lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
+                axis=1
+            )
+            return asset_list
+
+        if method == "address1_extraction":
+
+            x = asset_list_df[FULLADDRESS_COLUMN].values[0]
+            parsed = usaddress.parse(x)
+
+            def extract_address_1():
+
+
+    raise ValueError(f"Method {method} not recognized")
+
+    @staticmethod
+    def _address1_extraction(x):
+
+
     def standardise(self):
         """
         This function is used to standardise the asset list
         :return: standardised asset list
         """
 
+        if self.address1_colname is None:
+            # If we do not have this, we produce it
+
+
         # We keep just the columns we care about and will work through the various columns and standardise
         self.standardised_asset_list = self.raw_asset_list[
             [
-
+                self.address1_colname, self.postcode_colname, self.full_address_colname,
+                self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
             ]
         ]
 
diff --git a/asset_list/README.md b/asset_list/README.md
new file mode 100644
index 00000000..1bf734a4
--- /dev/null
+++ b/asset_list/README.md
@@ -0,0 +1,172 @@
+# libpostal Installation Guide for macOS M1
+
+## Overview
+
+`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
+provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
+
+---
+
+## 📌 Prerequisites
+
+Before installing `libpostal`, ensure you have the necessary dependencies installed.
+
+### **1️⃣ Install Required Dependencies**
+
+Open a terminal and run:
+
+```bash
+brew install curl autoconf automake libtool pkg-config
+```
+
+### **2️⃣ Clone the libpostal Repository**
+
+```bash
+git clone https://github.com/openvenues/libpostal.git
+cd libpostal
+```
+
+### **3️⃣ Run Bootstrap Script**
+
+```bash
+./bootstrap.sh
+```
+
+### **4️⃣ Configure the Build (Important for M1 Macs)**
+
+Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility.
+
+```bash
+./configure --disable-sse2 --datadir=/usr/local/libpostal_data
+```
+
+*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
+
+### **5️⃣ Compile and Install**
+
+```bash
+make -j$(sysctl -n hw.ncpu)
+sudo make install
+```
+
+### **6️⃣ Install Python Bindings**
+
+Once `libpostal` is installed, install the Python package:
+
+```bash
+pip install postal
+```
+
+---
+
+## ✅ **Verify Installation**
+
+To check if `libpostal` was installed successfully, run:
+
+```bash
+python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
+```
+
+**Expected Output:**
+
+```
+[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')]
+```
+
+---
+
+## 📌 **Usage Example in Python**
+
+### **Address Parsing**
+
+```python
+from postal.parser import parse
+
+address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
+parsed_address = dict(parse(address))
+
+print(parsed_address)
+```
+
+**Expected Output:**
+
+```python
+{
+    'house_number': '23',
+    'road': 'Clifton Hill',
+    'city': 'Newtown',
+    'city': 'Exeter',
+    'postcode': 'EX1 2DL'
+}
+```
+
+### **Address Normalization**
+
+```python
+from postal.normalize import normalize_string
+
+address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
+normalized = normalize_string(address)
+
+print(normalized)
+```
+
+---
+
+## 📌 **Troubleshooting**
+
+### **1️⃣ libpostal Not Found?**
+
+If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
+
+- You ran `sudo make install`
+- Your Python environment recognizes `postal`. Try:
+  ```bash
+  pip install postal --no-cache-dir
+  ```
+- If using a virtual environment (`venv`), activate it before running Python.
+
+### **2️⃣ Compilation Issues on macOS?**
+
+If `make` fails, try running:
+
+```bash
+brew reinstall autoconf automake libtool pkg-config
+```
+
+Then restart the installation process.
+
+### **3️⃣ Can't Find libpostal Data Directory?**
+
+Ensure `libpostal_data` exists in the correct directory:
+
+```bash
+ls /usr/local/libpostal_data
+```
+
+If missing, re-run `./configure` with the correct path.
+
+---
+
+## 🛠 **Uninstallation**
+
+To remove `libpostal`, run:
+
+```bash
+sudo rm -rf /usr/local/lib/libpostal*
+sudo rm -rf /usr/local/include/libpostal*
+rm -rf ~/libpostal
+pip uninstall postal
+```
+
+---
+
+## 📌 **Additional Resources**
+
+- [Libpostal GitHub](https://github.com/openvenues/libpostal)
+- [Libpostal Python Bindings](https://pypi.org/project/postal/)
+- [Homebrew](https://brew.sh/)
+
+---
+
+### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
new file mode 100644
index 00000000..d77c8a58
--- /dev/null
+++ b/asset_list/requirements.txt
@@ -0,0 +1,3 @@
+postal
+pandas
+usaddress
\ No newline at end of file
diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py
new file mode 100644
index 00000000..f0e6ce11
--- /dev/null
+++ b/asset_list/tests/test_standardisation.py
@@ -0,0 +1,9 @@
+from asset_list.AssetList import AssetList
+
+
+def test_address1_extraction():
+    example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
+
+    AssetList._extract_address1(
+        example,
+    )
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 06082774..74dc28e0 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -346,10 +346,24 @@ def app():
 
     invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
 
-    asset_list = AssetList(
+    self = AssetList(
         local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
         header=0,
-        sheet_name=SHEET_NAME
+        sheet_name=SHEET_NAME,
+        address1_colname=ADDRESS1_COLUMN,
+        postcode_colname=POSTCODE_COLUMN,
+        full_address_colname=FULLADDRESS_COLUMN,
+        full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
+        missing_postcodes_method=MISSING_POSTCODES_METHOD,
+        landlord_year_built=PROPERTY_YEAR_BUILT,
+        landlord_uprn=UPRN_COLUMN,
+        landlord_property_type=PROPERTY_TYPE_COLUMN,
+        landlord_wall_construction="Wall Construction (EPC)",
+        landlord_heating_system="Heat Source",
+        landlord_existing_pv="PV (Y/N)"
+    )
+    self.standardised_asset_list(
+        # In here, we might want to pass some specific remaps
     )
 
     # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"

From cb0194c3b96f839e5050073eb76e2f23e822c87f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 14:12:57 +0000
Subject: [PATCH 38/72] working on address extraction

---
 asset_list/AssetList.py                  | 119 +++++++++++++---
 asset_list/README.md                     | 172 -----------------------
 asset_list/requirements.txt              |   7 +-
 asset_list/tests/test_standardisation.py |   9 +-
 backend/SearchEpc.py                     |  14 +-
 backend/tests/test_search_epc.py         |   9 ++
 etl/route_march_data_pull/app.py         |   2 +
 7 files changed, 130 insertions(+), 202 deletions(-)
 delete mode 100644 asset_list/README.md

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 35da9c3b..1a3f6180 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -17,7 +17,7 @@ class AssetList:
         "first_two_words",  # This method will split on the fist two words, where the separator is a space
         "first_word",  # This method will split on the first word, where the separator is a space
         "house_number_extraction",  # This method will use the NLP model in SearchEPC to extract the housenumber
-        "address1_extraction"  # This method will use the NLP model to extract address1
+        # "address1_extraction"  # This method will use the NLP model to extract address1
     ]
 
     STANDARD_PROPERTY_TYPES = [
@@ -29,6 +29,19 @@ class AssetList:
         "block house",
     ]
 
+    # Standard column Names
+    STANDARD_ADDRESS_1 = "domna_address_1"
+    STANDARD_POSTCODE = "domna_postcode"
+    STANDARD_FULL_ADDRESS = "domna_full_address"
+    STANDARD_YEAR_BUILT = "domna_year_built"
+    STANDARD_UPRN = "ordnance_survey_uprn"
+    STANDARD_PROPERTY_TYPE = "landlord_property_type"
+    STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
+    STANDARD_HEATING_SYSTEM = "landlord_heating_system"
+    STANDARD_EXISTING_PV = "landlord_existing_pv"
+
+    DOMNA_PROPERTY_ID = "domna_property_id"
+
     def __init__(
         self,
         local_filepath,
@@ -36,8 +49,10 @@ class AssetList:
         address1_colname,
         postcode_colname,
         full_address_colname,
+        landlord_property_id=None,
         full_address_cols_to_concat=None,
         missing_postcodes_method=None,
+        address1_extraction_method=None,
         landlord_year_built=None,
         landlord_uprn=None,
         landlord_property_type=None,
@@ -48,14 +63,15 @@ class AssetList:
     ):
         self.local_filepath = local_filepath
         self.sheet_name = sheet_name
-        self.standardised_asset_list = None
         # Read in the data
         self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
+        self.standardised_asset_list = self.raw_asset_list.copy()
 
         # We detect the presence of the non-intrusive columns
         self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
 
         # Names of columns
+        self.landlord_property_id = landlord_property_id
         self.address1_colname = address1_colname
         self.postcode_colname = postcode_colname
         self.full_address_colname = full_address_colname
@@ -69,6 +85,7 @@ class AssetList:
         # parameters for cleaning
         self.full_address_cols_to_concat = full_address_cols_to_concat
         self.missing_postcodes_method = missing_postcodes_method
+        self.address1_extraction_method = address1_extraction_method
 
         self.debug_information = {
             "property_type": None,
@@ -77,40 +94,50 @@ class AssetList:
             "existing_pv": None
         }
 
-    @classmethod
-    def _extract_address1(cls, asset_list, full_address_col, postcode_col, method="first_two_words"):
+    def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
 
-        if method not in cls.ADDRESS_1_CLEANING_METHODS:
+        if method not in self.ADDRESS_1_CLEANING_METHODS:
             raise ValueError(f"Method {method} for producing address1 not recognized")
 
         if method == "first_two_words":
-            asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+            asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
             return asset_list
 
         if method == "first_word":
-            asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
+            asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0]
             return asset_list
 
         if method == "house_number_extraction":
-            asset_list["address1_extracted"] = asset_list.apply(
+            asset_list[self.address1_colname] = asset_list.apply(
                 lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
                 axis=1
             )
             return asset_list
 
-        if method == "address1_extraction":
-
-            x = asset_list_df[FULLADDRESS_COLUMN].values[0]
-            parsed = usaddress.parse(x)
-
-            def extract_address_1():
-
-
-    raise ValueError(f"Method {method} not recognized")
+        raise ValueError(f"Method {method} not recognized")
 
     @staticmethod
     def _address1_extraction(x):
+        pass
 
+    def create_property_id(self):
+        """
+        This function creates the domna property ID, which is simply a hash of the full address and postcode
+        We want all figures to be positive
+        :return:
+        """
+        import sys
+        self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
+            self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[
+            self.postcode_colname]
+        ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width)
+
+    @staticmethod
+    def _strip_postcode_from_full_address(full_address, postcode):
+        cleaned = full_address.replace(postcode, "")
+        # Remove any trailing commas and spaces
+        cleaned = cleaned.rstrip(", ").strip(",").strip()
+        return cleaned
 
     def standardise(self):
         """
@@ -118,15 +145,63 @@ class AssetList:
         :return: standardised asset list
         """
 
-        if self.address1_colname is None:
-            # If we do not have this, we produce it
+        # Remove rows without a postcode
+        if self.postcode_colname is not None:
+            self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])
 
+        # We clean up potential non-breaking spaces, and double spaces
+        for col in [
+            c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
+            c is not None
+        ]:
+            self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
+            self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
+            self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('  ', ' ', regex=False)
+
+        if self.address1_colname is None:
+            if self.address1_extraction_method is None:
+                raise ValueError("Missing address 1 - please specify an extraction method")
+            self.address1_colname = self.STANDARD_ADDRESS_1
+            # If we do not have this, we produce it
+            self.standardised_asset_list = self._extract_address1(
+                asset_list=self.standardised_asset_list,
+                full_address_col=self.full_address_colname,
+                postcode_col=self.postcode_colname,
+                method=self.address1_extraction_method
+            )
+
+        if self.full_address_colname is None:
+            if not self.full_address_cols_to_concat:
+                raise ValueError("Missing full address - please specify columns to concatenate")
+            self.full_address_colname = self.STANDARD_FULL_ADDRESS
+            self.standardised_asset_list[self.full_address_colname] = (
+                self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1)
+            )
+        else:
+
+            # Make sure to strip the postcode out of the full address
+            self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
+                lambda x: self._strip_postcode_from_full_address(
+                    full_address=x[self.full_address_colname],
+                    postcode=x[self.postcode_colname]
+                ),
+                axis=1
+            )
+
+        # We create the domna property id
+        self.create_property_id()
 
         # We keep just the columns we care about and will work through the various columns and standardise
-        self.standardised_asset_list = self.raw_asset_list[
+        self.standardised_asset_list = self.standardised_asset_list[
             [
-                self.address1_colname, self.postcode_colname, self.full_address_colname,
-                self.landlord_year_built, self.landlord_uprn, self.landlord_property_type
+                self.landlord_property_id,
+                self.DOMNA_PROPERTY_ID,
+                self.address1_colname,
+                self.postcode_colname,
+                self.full_address_colname,
+                self.landlord_year_built,
+                self.landlord_uprn,
+                self.landlord_property_type,
             ]
         ]
 
diff --git a/asset_list/README.md b/asset_list/README.md
deleted file mode 100644
index 1bf734a4..00000000
--- a/asset_list/README.md
+++ /dev/null
@@ -1,172 +0,0 @@
-# libpostal Installation Guide for macOS M1
-
-## Overview
-
-`libpostal` is a fast, open-source address parsing and normalization library, designed for global addresses. This guide
-provides detailed steps to install `libpostal` on macOS with Apple Silicon (M1/M2) and use it with Python.
-
----
-
-## 📌 Prerequisites
-
-Before installing `libpostal`, ensure you have the necessary dependencies installed.
-
-### **1️⃣ Install Required Dependencies**
-
-Open a terminal and run:
-
-```bash
-brew install curl autoconf automake libtool pkg-config
-```
-
-### **2️⃣ Clone the libpostal Repository**
-
-```bash
-git clone https://github.com/openvenues/libpostal.git
-cd libpostal
-```
-
-### **3️⃣ Run Bootstrap Script**
-
-```bash
-./bootstrap.sh
-```
-
-### **4️⃣ Configure the Build (Important for M1 Macs)**
-
-Since M1 chips do not support SSE2 natively, you **must** disable SSE2 for compatibility.
-
-```bash
-./configure --disable-sse2 --datadir=/usr/local/libpostal_data
-```
-
-*(You can replace `/usr/local/libpostal_data` with another directory that has a few GB of space.)*
-
-### **5️⃣ Compile and Install**
-
-```bash
-make -j$(sysctl -n hw.ncpu)
-sudo make install
-```
-
-### **6️⃣ Install Python Bindings**
-
-Once `libpostal` is installed, install the Python package:
-
-```bash
-pip install postal
-```
-
----
-
-## ✅ **Verify Installation**
-
-To check if `libpostal` was installed successfully, run:
-
-```bash
-python -c "import postal; print(postal.parser.parse('23 Clifton Hill, Newtown, Exeter, EX1 2DL'))"
-```
-
-**Expected Output:**
-
-```
-[('23', 'house_number'), ('Clifton Hill', 'road'), ('Newtown', 'city'), ('Exeter', 'city'), ('EX1 2DL', 'postcode')]
-```
-
----
-
-## 📌 **Usage Example in Python**
-
-### **Address Parsing**
-
-```python
-from postal.parser import parse
-
-address = "23 Clifton Hill, Newtown, Exeter, EX1 2DL"
-parsed_address = dict(parse(address))
-
-print(parsed_address)
-```
-
-**Expected Output:**
-
-```python
-{
-    'house_number': '23',
-    'road': 'Clifton Hill',
-    'city': 'Newtown',
-    'city': 'Exeter',
-    'postcode': 'EX1 2DL'
-}
-```
-
-### **Address Normalization**
-
-```python
-from postal.normalize import normalize_string
-
-address = "Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL"
-normalized = normalize_string(address)
-
-print(normalized)
-```
-
----
-
-## 📌 **Troubleshooting**
-
-### **1️⃣ libpostal Not Found?**
-
-If you encounter an error like `ModuleNotFoundError: No module named 'postal'`, make sure:
-
-- You ran `sudo make install`
-- Your Python environment recognizes `postal`. Try:
-  ```bash
-  pip install postal --no-cache-dir
-  ```
-- If using a virtual environment (`venv`), activate it before running Python.
-
-### **2️⃣ Compilation Issues on macOS?**
-
-If `make` fails, try running:
-
-```bash
-brew reinstall autoconf automake libtool pkg-config
-```
-
-Then restart the installation process.
-
-### **3️⃣ Can't Find libpostal Data Directory?**
-
-Ensure `libpostal_data` exists in the correct directory:
-
-```bash
-ls /usr/local/libpostal_data
-```
-
-If missing, re-run `./configure` with the correct path.
-
----
-
-## 🛠 **Uninstallation**
-
-To remove `libpostal`, run:
-
-```bash
-sudo rm -rf /usr/local/lib/libpostal*
-sudo rm -rf /usr/local/include/libpostal*
-rm -rf ~/libpostal
-pip uninstall postal
-```
-
----
-
-## 📌 **Additional Resources**
-
-- [Libpostal GitHub](https://github.com/openvenues/libpostal)
-- [Libpostal Python Bindings](https://pypi.org/project/postal/)
-- [Homebrew](https://brew.sh/)
-
----
-
-### 🎉 You’re all set! Now you can use `libpostal` to parse and clean address data efficiently. 🚀
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
index d77c8a58..d6d64471 100644
--- a/asset_list/requirements.txt
+++ b/asset_list/requirements.txt
@@ -1,3 +1,8 @@
 postal
 pandas
-usaddress
\ No newline at end of file
+usaddress
+pydantic-settings==2.6.0
+epc-api-python==1.0.2
+fuzzywuzzy
+boto3
+openpyxl
\ No newline at end of file
diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py
index f0e6ce11..1a083bbc 100644
--- a/asset_list/tests/test_standardisation.py
+++ b/asset_list/tests/test_standardisation.py
@@ -1,9 +1,12 @@
 from asset_list.AssetList import AssetList
+from backend.SearchEpc import
+
 
 
 def test_address1_extraction():
     example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
 
-    AssetList._extract_address1(
-        example,
-    )
+    # AssetList._extract_address1(
+    #     example,
+    # )
+    pass
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index e8a9dfaa..79a041ec 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -208,9 +208,14 @@ class SearchEpc:
         try:
             # Updated regex to catch house numbers including alphanumeric ones
             pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)'
-            match = re.search(pattern, address)
-            if match:
-                return next(g for g in match.groups() if g is not None)
+            match1 = re.search(pattern, address)
+            if match1:
+                return next(g for g in match1.groups() if g is not None)
+
+            pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)'
+            match2 = re.search(pattern2, address)
+            if match2:
+                return match2.group(2)
 
             parsed = usaddress.parse(address)
             # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
@@ -221,7 +226,8 @@ class SearchEpc:
                             continue
                         if part == postcode.split(" ")[1]:
                             continue
-                    return part  # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
+                    return part.rstrip(
+                        ",")  # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
                     # number
 
             # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found
diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py
index 3b2e2a5b..562585ad 100644
--- a/backend/tests/test_search_epc.py
+++ b/backend/tests/test_search_epc.py
@@ -48,3 +48,12 @@ class TestSearchEpcIntegration:
         assert epc_searcher.newest_epc["lmk-key"] == lmk_key
         assert epc_searcher.newest_epc["uprn"] == uprn
         assert len(epc_searcher.older_epcs) == n_old_epcs
+
+    def test_search_housenumber(self):
+        eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter'
+        res1 = SearchEpc.get_house_number(eg1, None)
+        assert res1 == "A11"
+
+        eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL'
+        res2 = SearchEpc.get_house_number(eg2, None)
+        assert res2 == "A9"
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 74dc28e0..fcf11765 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -352,9 +352,11 @@ def app():
         sheet_name=SHEET_NAME,
         address1_colname=ADDRESS1_COLUMN,
         postcode_colname=POSTCODE_COLUMN,
+        landlord_property_id="UPRN",
         full_address_colname=FULLADDRESS_COLUMN,
         full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
         missing_postcodes_method=MISSING_POSTCODES_METHOD,
+        address1_extraction_method=ADDRESS1_METHOD,
         landlord_year_built=PROPERTY_YEAR_BUILT,
         landlord_uprn=UPRN_COLUMN,
         landlord_property_type=PROPERTY_TYPE_COLUMN,

From 0a643d80adb412ea4069664cc12efaf9e71fad42 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 14:21:29 +0000
Subject: [PATCH 39/72] building out multi-unit flagging

---
 asset_list/AssetList.py                  | 16 ++++++++++++++--
 asset_list/tests/test_standardisation.py | 11 ++---------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 1a3f6180..fde24fe2 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1,5 +1,4 @@
-import os
-import usaddress
+import re
 import pandas as pd
 from utils.logger import setup_logger
 from backend.SearchEpc import SearchEpc
@@ -42,6 +41,9 @@ class AssetList:
 
     DOMNA_PROPERTY_ID = "domna_property_id"
 
+    # Regular expression for identifying if the address might point to multiple units
+    MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b')
+
     def __init__(
         self,
         local_filepath,
@@ -139,6 +141,14 @@ class AssetList:
         cleaned = cleaned.rstrip(", ").strip(",").strip()
         return cleaned
 
+    @classmethod
+    def _identify_multi_address(cls, address):
+        # We check if the address is comma separated
+        if "," in address:
+            address1_section = address.split(",")[0]
+            # We look for string in the form (x-y)
+            return bool(cls.MULTI_UNIT_REGEX.search(address1_section))
+
     def standardise(self):
         """
         This function is used to standardise the asset list
@@ -205,4 +215,6 @@ class AssetList:
             ]
         ]
 
+        # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y)
+
         raise NotImplementedError
diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py
index 1a083bbc..b6d9a391 100644
--- a/asset_list/tests/test_standardisation.py
+++ b/asset_list/tests/test_standardisation.py
@@ -1,12 +1,5 @@
 from asset_list.AssetList import AssetList
-from backend.SearchEpc import
 
 
-
-def test_address1_extraction():
-    example = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
-
-    # AssetList._extract_address1(
-    #     example,
-    # )
-    pass
+def test_multi_unit_address_flagging():
+    assert AssetList._identify_multi_address('Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL')

From ecf8e46c65ae7e09725258bcb578690d1156bf14 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 22:12:29 +0000
Subject: [PATCH 40/72] getting asset list class live

---
 .idea/terraform.xml                    |   6 +
 asset_list/AssetList.py                | 321 +++++++++++++++++++++++--
 asset_list/app.py                      |   1 +
 asset_list/mappings/exising_pv.py      |   8 +
 asset_list/mappings/heating_systems.py |  46 ++++
 asset_list/mappings/property_type.py   |  16 ++
 asset_list/mappings/walls.py           |  38 +++
 asset_list/requirements.txt            |   4 +-
 etl/route_march_data_pull/app.py       |   5 +-
 9 files changed, 420 insertions(+), 25 deletions(-)
 create mode 100644 .idea/terraform.xml
 create mode 100644 asset_list/app.py
 create mode 100644 asset_list/mappings/exising_pv.py
 create mode 100644 asset_list/mappings/heating_systems.py
 create mode 100644 asset_list/mappings/property_type.py
 create mode 100644 asset_list/mappings/walls.py

diff --git a/.idea/terraform.xml b/.idea/terraform.xml
new file mode 100644
index 00000000..cd46a3d3
--- /dev/null
+++ b/.idea/terraform.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="TerraformProjectSettings">
+    <option name="toolPath" value="/opt/homebrew/bin/terraform" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index fde24fe2..e61cc89b 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1,16 +1,200 @@
+import os
 import re
+from datetime import datetime
+from openai import OpenAI
+import tiktoken
+import numpy as np
 import pandas as pd
+from fuzzywuzzy import process
 from utils.logger import setup_logger
 from backend.SearchEpc import SearchEpc
+import asset_list.mappings.property_type as property_type_mappings
+import asset_list.mappings.walls as walls_mappings
+import asset_list.mappings.heating_systems as heating_mappings
+import asset_list.mappings.exising_pv as existing_pv_mappings
 
 logger = setup_logger()
 
+# OpenAI API Key (set this in your environment variables for security)
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+
+class DataRemapper:
+    def __init__(self, standard_values, standard_map=None, max_tokens=1000):
+        """
+        Initialize the remapper with standard values and a predefined mapping.
+
+        :param standard_values: Set of allowed standardized values.
+        :param standard_map: Dictionary of common remappings {raw_value: standard_value}.
+        """
+        self.standard_values = {v.lower() for v in standard_values}  # Normalize to lowercase
+        self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()}  # Predefined mappings
+        self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
+        self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing
+
+        # Tokenizer for counting tokens
+        self.tokenizer = tiktoken.encoding_for_model(self.ai_model)
+
+        # Track token usage and remap dictionary
+        self.total_tokens_used = 0
+        self.total_cost = 0
+        self.remap_dict = {}  # {original_value: standardized_value}
+        self.max_tokens = 1000  # Limit for OpenAI API
+
+        # Memoization for AI calls
+        self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
+        # Capture the response for debugging
+        self.ai_response = None
+
+        # OpenAI pricing (as of Feb 2024)
+        self.pricing = {
+            "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
+            "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
+        }
+
+        self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
+    @staticmethod
+    def clean_string(text):
+        """Basic text cleaning: remove extra spaces, punctuation, and normalize case."""
+        if not isinstance(text, str):
+            return None
+        text = text.strip().lower()
+        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+        return text
+
+    def fuzzy_match(self, text):
+        """Use fuzzy matching to find the closest standard value."""
+        match, score = process.extractOne(text, self.standard_values) if text else (None, 0)
+        return match if score >= self.fuzzy_threshold else None
+
+    def count_tokens(self, text):
+        """Estimate the number of tokens in a given text."""
+        return len(self.tokenizer.encode(text)) if text else 0
+
+    def ai_standardize(self, unmapped_values):
+        """Call OpenAI API **once** for all unmapped values to minimize cost, with memoization."""
+        if not unmapped_values:
+            return {}
+
+        unmapped_tuple = tuple(sorted(unmapped_values))  # Ensure consistency for memoization
+        if unmapped_tuple in self.ai_cache:
+            return self.ai_cache[unmapped_tuple]  # Return memoized result
+
+        prompt = f"""
+        You are an expert in data classification. Standardize each of these values into one of the categories: 
+        {list(self.standard_values)}. 
+
+        Return only a JSON dictionary where:
+        - The keys are the original values.
+        - The values are the standardized ones.
+
+        Strictly return JSON **without markdown formatting** or extra text.
+
+        Example Output:
+        {{
+            "BLKHOUS": "block house",
+            "BEDSIT": "bedsit"
+        }}
+
+        Values to standardize:
+        {unmapped_values}
+        """
+
+        # Count input tokens
+        input_tokens = self.count_tokens(prompt)
+        if input_tokens > self.max_tokens:
+            raise ValueError("Input tokens exceed the maximum limit.")
+
+        response = self.openai_client.chat.completions.create(
+            model=self.ai_model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=self.max_tokens,
+            temperature=0.1,
+        )
+
+        output_text = response.choices[0].message.content.strip()
+        output_tokens = self.count_tokens(output_text)  # Count output tokens
+
+        # Track total token usage
+        self.total_tokens_used += input_tokens + output_tokens
+
+        # Estimate cost
+        input_cost = input_tokens * self.pricing[self.ai_model]["input"]
+        output_cost = output_tokens * self.pricing[self.ai_model]["output"]
+        self.total_cost += input_cost + output_cost
+
+        try:
+            # Parse response as dictionary
+            mapping = eval(output_text)  # OpenAI should return a valid dictionary
+        except:
+            mapping = {val: "unknown" for val in unmapped_values}  # Fallback
+
+        # Memoize the AI response
+        self.ai_cache[unmapped_tuple] = mapping
+        # We store the raw AI response for debugging
+        logger.debug(f"AI Response: {mapping}")
+        self.ai_response = output_text
+
+        return mapping
+
+    def standardize_list(self, values_to_remap):
+        """
+        Standardizes a list of values and returns a dictionary {original_value: standardized_value}.
+
+        :param values_to_remap: List of raw values to standardize.
+        :return: Dictionary {original_value: standardized_value}.
+        """
+        unique_values = set(values_to_remap)  # Process only unique values
+
+        unmapped_values = []
+        for value in unique_values:
+            if pd.isna(value):  # Handle NaN values
+                self.remap_dict[value] = "unknown"
+                continue
+
+            cleaned_value = self.clean_string(value)
+
+            # Rule-Based Check (Predefined Mapping)
+            if cleaned_value in self.standard_map:
+                self.remap_dict[value] = self.standard_map[cleaned_value]
+                continue
+
+            # Exact Match in Standard Values
+            if cleaned_value in self.standard_values:
+                self.remap_dict[value] = cleaned_value
+                continue
+
+            # Fuzzy Matching
+            fuzzy_match = self.fuzzy_match(cleaned_value)
+            if fuzzy_match:
+                self.remap_dict[value] = fuzzy_match
+                continue
+
+            # Capture anything that wasn't mapped
+            unmapped_values.append(value)
+
+        # AI Model - remap anything unmapped (batch request)
+        ai_mapping = self.ai_standardize(unmapped_values)
+        self.remap_dict.update(ai_mapping)
+
+        return self.remap_dict
+
+    def report_usage(self):
+        """Prints a summary of token usage and cost."""
+        print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
+        print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
+
 
 class AssetList:
     """
     This class is used to standardise asset lists so that we can process the core information in a consistent manner.
     """
 
+    DATETIME_REMAP = {
+        "Pre 1900": datetime(year=1899, month=12, day=31),
+    }
+
     # These are the accepted methods we have for cleaning the address1 column
     ADDRESS_1_CLEANING_METHODS = [
         "first_two_words",  # This method will split on the fist two words, where the separator is a space
@@ -19,15 +203,6 @@ class AssetList:
         # "address1_extraction"  # This method will use the NLP model to extract address1
     ]
 
-    STANDARD_PROPERTY_TYPES = [
-        "house",
-        "flat",
-        "bungalow",
-        "maisonette",
-        "park home",
-        "block house",
-    ]
-
     # Standard column Names
     STANDARD_ADDRESS_1 = "domna_address_1"
     STANDARD_POSTCODE = "domna_postcode"
@@ -44,6 +219,15 @@ class AssetList:
     # Regular expression for identifying if the address might point to multiple units
     MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b')
 
+    # List of columns relating to the non-intrusive data
+    NON_INTRUSIVES_COLNAMES = [
+        "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required",
+        "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION",
+        "Any further surveyor notes", 'Surveyors Name'
+    ]
+
+    #### Mapping for wall construction
+
     def __init__(
         self,
         local_filepath,
@@ -96,6 +280,8 @@ class AssetList:
             "existing_pv": None
         }
 
+        self.variable_mappings = {}
+
     def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
 
         if method not in self.ADDRESS_1_CLEANING_METHODS:
@@ -149,7 +335,7 @@ class AssetList:
             # We look for string in the form (x-y)
             return bool(cls.MULTI_UNIT_REGEX.search(address1_section))
 
-    def standardise(self):
+    def init_standardise(self):
         """
         This function is used to standardise the asset list
         :return: standardised asset list
@@ -202,19 +388,110 @@ class AssetList:
         self.create_property_id()
 
         # We keep just the columns we care about and will work through the various columns and standardise
-        self.standardised_asset_list = self.standardised_asset_list[
-            [
-                self.landlord_property_id,
-                self.DOMNA_PROPERTY_ID,
-                self.address1_colname,
-                self.postcode_colname,
-                self.full_address_colname,
-                self.landlord_year_built,
-                self.landlord_uprn,
-                self.landlord_property_type,
-            ]
+        variables = [
+            self.landlord_property_id,
+            self.DOMNA_PROPERTY_ID,
+            self.address1_colname,
+            self.postcode_colname,
+            self.full_address_colname,
+            self.landlord_uprn,
+            self.landlord_property_type,
+            self.landlord_year_built,
+            self.landlord_wall_construction,
+            self.landlord_heating_system,
+            self.landlord_existing_pv
         ]
+        rename = {}
+
+        if self.non_intrusives_present:
+            variables += self.NON_INTRUSIVES_COLNAMES
+            rename = {
+                **rename,
+                **dict(
+                    zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES])
+                )
+            }
+
+        self.standardised_asset_list = self.standardised_asset_list[variables].rename(
+            columns=rename
+        )
 
         # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y)
+        self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[
+            self.full_address_colname
+        ].apply(lambda x: self._identify_multi_address(x))
 
-        raise NotImplementedError
+        # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and
+        # we see instances of "average thermal transmittance" in the description
+        self.standardised_asset_list[self.landlord_wall_construction] = np.where(
+            self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains(
+                "average thermal transmittance"
+            ),
+            "new build - average thermal transmittance",
+            self.standardised_asset_list[self.landlord_wall_construction]
+        )
+
+        # Clear our build year column
+
+        # We attempt to process the year built column
+        if self.landlord_year_built is not None:
+            # We check if we have a datetime
+            if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime):
+                # We treat any string columns - with common values we see
+                self.standardised_asset_list[self.landlord_year_built] = (
+                    self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP)
+                )
+
+                self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime(
+                    self.standardised_asset_list[self.landlord_year_built]
+                )
+                # Convert this to year
+                self.standardised_asset_list[self.landlord_year_built] = (
+                    self.standardised_asset_list[self.landlord_year_built].dt.year
+                )
+            else:
+                raise NotImplementedError("Year built column must be a datetime - implement me")
+
+        # We now create standard lookups
+        to_remap = {
+            self.landlord_property_type: {
+                "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES,
+                "standard_map": property_type_mappings.PROPERTY_MAPPING
+            },
+            self.landlord_wall_construction: {
+                "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS,
+                "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS
+            },
+            self.landlord_heating_system: {
+                "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS,
+                "standard_map": heating_mappings.HEATING_MAPPINGS
+            },
+            self.landlord_existing_pv: {
+                "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV,
+                "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS
+            }
+        }
+
+        for variable, config in to_remap.items():
+            logger.info("Standardising variable: %s", variable)
+            values_to_remap = self.standardised_asset_list[variable].unique()
+            # We want to map this to our standardised list of property types we're interested in
+            remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"])
+            remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist())
+            self.variable_mappings[variable] = remap_dictionary
+
+        # We now print out the variable mappings, which can be reviewed by the user, before the final standardised
+        # asset list is returned
+
+    def apply_standardiation(self, override_empty_mappings=False):
+        """
+        This function applies the standardisation to the asset list
+        :param override_empty_mappings: If true, will override the check for empty mappings. This is only relevant
+        if there are no categories which need remapping which is highly unlikely
+        :return:
+        """
+        if not self.variable_mappings and not override_empty_mappings:
+            raise ValueError("Please run init_standardise first")
+
+    def create_lookup_mappings(self):
+        pass
diff --git a/asset_list/app.py b/asset_list/app.py
new file mode 100644
index 00000000..21b405d8
--- /dev/null
+++ b/asset_list/app.py
@@ -0,0 +1 @@
+import os
diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py
new file mode 100644
index 00000000..1e45bd83
--- /dev/null
+++ b/asset_list/mappings/exising_pv.py
@@ -0,0 +1,8 @@
+STANDARD_EXISTING_PV = {
+    "already has PV", "no PV", "unknown"
+}
+
+EXISTING_PV_MAPPINGS = {
+    "NO": "no PV",
+    "YES": "already has PV",
+}
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
new file mode 100644
index 00000000..4fce39ab
--- /dev/null
+++ b/asset_list/mappings/heating_systems.py
@@ -0,0 +1,46 @@
+STANDARD_HEATING_SYSTEMS = {
+    "gas combi boiler",
+    "electric storage heaters",
+    "district heating",
+    "gas condensing boiler",
+    "oil boiler",
+    "gas condensing combi",
+    "air source heat pump",
+    "boiler - other fuel",
+    "ground source heat pump",
+    "electric radiators",
+    "other",
+    "electric boiler",
+    "unknown",
+    "communal gas boiler",
+}
+
+HEATING_MAPPINGS = {
+    "Combi - GAS": "gas combi boiler",
+    "E7 Storage Heaters": "electric storage heaters",
+    "District heating system": "district heating",
+    "Condensing Boiler - GAS": "gas condensing boiler",
+    "Boiler Oil/other": "oil boiler",
+    "Condensing Combi - Gas": "gas condensing combi",
+    "Air Source Source Heat Pump": "air source heat pump",
+    "Biomass Boiler": "boiler - other fuel",
+    "Ground Source Heat Pump": "ground source heat pump",
+    "Electric Oil filled radiators": "electric radiators",
+    "Solid Fuel": "other",
+    "LPG Boiler": "boiler - other fuel",
+    "Electric Boiler": "electric boiler",
+    "No data": "unknown",
+    "Boiler Communal/Commercial - GAS": "communal gas boiler",
+    "Eco Electric Radiators": "electric radiators",
+    "Gas fire": "other",
+    "Backboiler - Solid fuel": "other",
+}
+
+# array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system',
+#        'Condensing Boiler - GAS', 'Boiler Oil/other',
+#        'Condensing Combi - Gas', 'Air Source Source Heat Pump',
+#        'Biomass Boiler', 'Ground Source Heat Pump',
+#        'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler',
+#        'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS',
+#        'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'],
+#       dtype=object)
diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py
new file mode 100644
index 00000000..bcad9ede
--- /dev/null
+++ b/asset_list/mappings/property_type.py
@@ -0,0 +1,16 @@
+# These are the standard categories for property types
+STANDARD_PROPERTY_TYPES = {
+    "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house",
+    "unknown", "other"
+}
+
+# This is a basic mapping that we use to map values that we've seen commonly to standard values
+PROPERTY_MAPPING = {
+    "HOUSE": "house",
+    "FLAT": "flat",
+    "MAISONET": "maisonette",
+    "BUNGALOW": "bungalow",
+    "BLKHOUS": "block house",
+    "BEDSIT": "bedsit",
+    "COACHSE": "coach house",
+}
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
new file mode 100644
index 00000000..7dec7d12
--- /dev/null
+++ b/asset_list/mappings/walls.py
@@ -0,0 +1,38 @@
+STANDARD_WALL_CONSTRUCTIONS = {
+    "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick",
+    "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob",
+    "new build - average thermal transmittance",
+}
+
+WALL_CONSTRUCTION_MAPPINGS = {
+    "New Build - Average Thermal Transmittance": "new build - average thermal transmittance",
+    'Average thermal transmittance 0.25 W/m?K': 'unknown',
+    'Cavity wall, as built, insulated (assumed)': 'filled cavity',
+    'Average thermal transmittance 0.31 W/m?K': 'unknown',
+    'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity',
+    'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown',
+    'Average thermal transmittance 0.16 W/m?K': 'unknown',
+    'Average thermal transmittance 0.27 W/m&#0178;K': 'unknown',
+    'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.18 W/m?K': 'unknown',
+    'Granite or whin, with internal insulation': 'granite or whinstone',
+    'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown',
+    'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown',
+    'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown',
+    'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown',
+    'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
+    'Average thermal transmittance 0.33 W/m?K': 'unknown', 'Cavity wall,': 'unknown',
+    'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
+    'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown',
+    'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown',
+    'Average thermal transmittance 0.26 W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown',
+    'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown',
+    'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown',
+    'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown',
+    'Cavity wall, with internal insulation': 'filled cavity',
+    'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown'
+}
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
index d6d64471..0c16c43a 100644
--- a/asset_list/requirements.txt
+++ b/asset_list/requirements.txt
@@ -5,4 +5,6 @@ pydantic-settings==2.6.0
 epc-api-python==1.0.2
 fuzzywuzzy
 boto3
-openpyxl
\ No newline at end of file
+openpyxl
+openai
+tiktoken
\ No newline at end of file
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index fcf11765..ca5195d6 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -364,10 +364,11 @@ def app():
         landlord_heating_system="Heat Source",
         landlord_existing_pv="PV (Y/N)"
     )
-    self.standardised_asset_list(
-        # In here, we might want to pass some specific remaps
+    self.init_standardise(
     )
 
+    self.apply_transformations()
+
     # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
     # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
     # SHEET_NAME = "Sheet1"

From 978deb286bc411a563631e81685319a38ef9061e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 22:32:05 +0000
Subject: [PATCH 41/72] debugging remapper

---
 asset_list/AssetList.py                | 19 ++++++++++----
 asset_list/mappings/exising_pv.py      |  4 +++
 asset_list/mappings/heating_systems.py | 17 ++++++-------
 asset_list/mappings/property_type.py   |  2 ++
 asset_list/mappings/walls.py           | 34 +++++++++++++++++++++++++-
 etl/route_march_data_pull/app.py       |  5 ++--
 6 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index e61cc89b..8f905a33 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -27,8 +27,8 @@ class DataRemapper:
         :param standard_values: Set of allowed standardized values.
         :param standard_map: Dictionary of common remappings {raw_value: standard_value}.
         """
-        self.standard_values = {v.lower() for v in standard_values}  # Normalize to lowercase
-        self.standard_map = {k.lower(): v.lower() for k, v in (standard_map or {}).items()}  # Predefined mappings
+        self.standard_values = standard_values
+        self.standard_map = standard_map
         self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
         self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing
 
@@ -39,7 +39,7 @@ class DataRemapper:
         self.total_tokens_used = 0
         self.total_cost = 0
         self.remap_dict = {}  # {original_value: standardized_value}
-        self.max_tokens = 1000  # Limit for OpenAI API
+        self.max_tokens = max_tokens  # Limit for OpenAI API
 
         # Memoization for AI calls
         self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
@@ -61,6 +61,8 @@ class DataRemapper:
             return None
         text = text.strip().lower()
         text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+        # Replace double strings
+        text = re.sub(r'\s+', ' ', text)
         return text
 
     def fuzzy_match(self, text):
@@ -106,6 +108,7 @@ class DataRemapper:
         if input_tokens > self.max_tokens:
             raise ValueError("Input tokens exceed the maximum limit.")
 
+        logger.info("Calling OpenAI API for standardization...")
         response = self.openai_client.chat.completions.create(
             model=self.ai_model,
             messages=[{"role": "user", "content": prompt}],
@@ -156,8 +159,14 @@ class DataRemapper:
             cleaned_value = self.clean_string(value)
 
             # Rule-Based Check (Predefined Mapping)
-            if cleaned_value in self.standard_map:
-                self.remap_dict[value] = self.standard_map[cleaned_value]
+            if cleaned_value in self.standard_map or value in self.standard_map:
+                self.remap_dict[value] = (
+                    self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value]
+                )
+                continue
+
+            if value.lower() in self.standard_map:
+                self.remap_dict[value] = self.standard_map[value.lower()]
                 continue
 
             # Exact Match in Standard Values
diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py
index 1e45bd83..06e77bba 100644
--- a/asset_list/mappings/exising_pv.py
+++ b/asset_list/mappings/exising_pv.py
@@ -5,4 +5,8 @@ STANDARD_EXISTING_PV = {
 EXISTING_PV_MAPPINGS = {
     "NO": "no PV",
     "YES": "already has PV",
+    "no": "no PV",
+    "yes": "already has PV",
+    True: "already has PV",
+    False: "no PV",
 }
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
index 4fce39ab..2fbdff70 100644
--- a/asset_list/mappings/heating_systems.py
+++ b/asset_list/mappings/heating_systems.py
@@ -34,13 +34,12 @@ HEATING_MAPPINGS = {
     "Eco Electric Radiators": "electric radiators",
     "Gas fire": "other",
     "Backboiler - Solid fuel": "other",
+    'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters',
+    'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler',
+    'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi',
+    'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel',
+    'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric radiators',
+    'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler',
+    'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler',
+    'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other',
 }
-
-# array(['Combi - GAS', 'E7 Storage Heaters', 'District heating system',
-#        'Condensing Boiler - GAS', 'Boiler Oil/other',
-#        'Condensing Combi - Gas', 'Air Source Source Heat Pump',
-#        'Biomass Boiler', 'Ground Source Heat Pump',
-#        'Electric Oil filled radiators', 'Solid Fuel', 'LPG Boiler',
-#        'Electric Boiler', 'No data', 'Boiler Communal/Commercial - GAS',
-#        'Eco Electric Radiators', 'Gas fire', 'Backboiler - Solid fuel'],
-#       dtype=object)
diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py
index bcad9ede..ec569123 100644
--- a/asset_list/mappings/property_type.py
+++ b/asset_list/mappings/property_type.py
@@ -11,6 +11,8 @@ PROPERTY_MAPPING = {
     "MAISONET": "maisonette",
     "BUNGALOW": "bungalow",
     "BLKHOUS": "block house",
+    "blkhous": "block house",
     "BEDSIT": "bedsit",
     "COACHSE": "coach house",
+    "coachse": "coach house",
 }
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
index 7dec7d12..33db1fef 100644
--- a/asset_list/mappings/walls.py
+++ b/asset_list/mappings/walls.py
@@ -1,3 +1,5 @@
+from asset_list.AssetList import DataRemapper
+
 STANDARD_WALL_CONSTRUCTIONS = {
     "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick",
     "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob",
@@ -18,6 +20,7 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown',
     'Average thermal transmittance 0.18 W/m?K': 'unknown',
     'Granite or whin, with internal insulation': 'granite or whinstone',
+    "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone",
     'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown',
     'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown',
     'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown',
@@ -34,5 +37,34 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown',
     'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown',
     'Cavity wall, with internal insulation': 'filled cavity',
-    'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown'
+    'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown',
+    'new build - average thermal transmittance': 'new build - average thermal transmittance',
+    'average thermal transmittance 0.25 w/m?k': 'unknown',
+    'cavity wall, as built, insulated (assumed)': 'filled cavity',
+    'average thermal transmittance 0.31 w/m?k': 'unknown',
+    'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity',
+    'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown',
+    'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m&#0178;k': 'unknown',
+    'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.18 w/m?k': 'unknown',
+    'granite or whin, with internal insulation': 'granite or whinstone',
+    'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown',
+    'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown',
+    'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown',
+    'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown',
+    'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
+    'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown',
+    'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
+    'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown',
+    'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown',
+    'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown',
+    'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown',
+    'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown',
+    'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown',
+    'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.28 w/m?k': 'unknown',
 }
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index ca5195d6..1289fb09 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -346,7 +346,7 @@ def app():
 
     invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
 
-    self = AssetList(
+    asset_list = AssetList(
         local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
         header=0,
         sheet_name=SHEET_NAME,
@@ -364,8 +364,7 @@ def app():
         landlord_heating_system="Heat Source",
         landlord_existing_pv="PV (Y/N)"
     )
-    self.init_standardise(
-    )
+    asset_list.init_standardise()
 
     self.apply_transformations()
 

From 776285dd1592e037f9345a4396d83db671dedd03 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 22:35:21 +0000
Subject: [PATCH 42/72] added map printing

---
 asset_list/AssetList.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 8f905a33..87402924 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1,8 +1,9 @@
 import os
 import re
+import tiktoken
+from pprint import pprint
 from datetime import datetime
 from openai import OpenAI
-import tiktoken
 import numpy as np
 import pandas as pd
 from fuzzywuzzy import process
@@ -491,6 +492,12 @@ class AssetList:
 
         # We now print out the variable mappings, which can be reviewed by the user, before the final standardised
         # asset list is returned
+        for variable, mapping in self.variable_mappings.items():
+            pprint(f"Variable: {variable}")
+            pprint(mapping)
+            # Print a space
+            print("\n")
+            pprint("=======================================")
 
     def apply_standardiation(self, override_empty_mappings=False):
         """

From 75e7c13a29ed98059a99e54245b72cebd9c52f48 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 19 Feb 2025 22:51:48 +0000
Subject: [PATCH 43/72] modifying creation of ids

---
 asset_list/AssetList.py          | 37 ++++++++++++++++++++++++++++----
 etl/route_march_data_pull/app.py | 13 +++--------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 87402924..b153b624 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -1,3 +1,4 @@
+import hashlib
 import os
 import re
 import tiktoken
@@ -324,11 +325,24 @@ class AssetList:
         We want all figures to be positive
         :return:
         """
-        import sys
+
+        # We'll remove punctuation and whitespace from the address, before hashing to produce an ID
+
+        def _make_hash(value):
+            """Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value."""
+            # Normalize and remove special characters for cleaner ID
+            cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower()
+
+            # Generate SHA-256 hash and truncate it
+            short_hash = hashlib.sha256(value.encode()).hexdigest()[:12]
+
+            return f"{cleaned_value}-{short_hash}"
+
+        # Apply transformation
         self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
-            self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[
-            self.postcode_colname]
-        ).apply(lambda x: hash(x) % 2 ** sys.hash_info.width)
+            self.standardised_asset_list[self.full_address_colname] +
+            self.standardised_asset_list[self.postcode_colname]
+        ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash)
 
     @staticmethod
     def _strip_postcode_from_full_address(full_address, postcode):
@@ -509,5 +523,20 @@ class AssetList:
         if not self.variable_mappings and not override_empty_mappings:
             raise ValueError("Please run init_standardise first")
 
+        logger.info("Applying standardisation to asset list")
+
+        for variable, mapping in self.variable_mappings.items():
+            self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
+
+        if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
+            # Drop the dupes
+            pprint(
+                f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated "
+                f"addresses - dropping"
+            )
+            self.standardised_asset_list = self.standardised_asset_list[
+                ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
+            ]
+
     def create_lookup_mappings(self):
         pass
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 1289fb09..54ae2280 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -344,7 +344,8 @@ def app():
     HAS_NON_INTRUSIVES = True
     PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits
 
-    invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
+    # Maps addresses to uprn in problematic cases
+    MANUAL_UPRN_MAP = {}
 
     asset_list = AssetList(
         local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
@@ -366,7 +367,7 @@ def app():
     )
     asset_list.init_standardise()
 
-    self.apply_transformations()
+    asset_list.apply_standardiation()
 
     # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
     # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
@@ -382,9 +383,6 @@ def app():
     # # If we have the non-intrusives data, this should be true
     # HAS_NON_INTRUSIVES = True
 
-    # Maps addresses to uprn in problematic cases
-    MANUAL_UPRN_MAP = {}
-
     asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
 
     if MISSING_POSTCODES_METHOD is not None:
@@ -464,11 +462,6 @@ def app():
 
     # We check for duplicated addresses
     asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
-    if asset_list["deduper"].duplicated().sum():
-        # Drop the dupes
-        print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
-        asset_list = asset_list[~asset_list["deduper"].duplicated()]
-    asset_list = asset_list.drop(columns=["deduper"])
 
     # We chunk up this data into 5000 rows at a time
     # Create the chunks directory

From fe6de36782bc3d413f7813ee54ad151e11bc929d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 07:46:52 +0000
Subject: [PATCH 44/72] creating new maps

---
 etl/route_march_data_pull/app.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 54ae2280..d520895d 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -6,6 +6,10 @@ import numpy as np
 from tqdm import tqdm
 from datetime import datetime
 from asset_list.AssetList import AssetList
+from asset_list.mappings.property_type import PROPERTY_MAPPING
+from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
+from asset_list.mappings.heating_systems import HEATING_MAPPINGS
+from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
 
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
@@ -367,6 +371,21 @@ def app():
     )
     asset_list.init_standardise()
 
+    # We produce the new maps (standard mappings overlaid with this run's remaps), which can be saved for future
+    # usage. dict.update() mutates in place and returns None, so we merge by unpacking instead of chaining it
+    new_property_type_map = {
+        **PROPERTY_MAPPING, **asset_list.variable_mappings[asset_list.landlord_property_type]
+    }
+    new_wall_map = {
+        **WALL_CONSTRUCTION_MAPPINGS, **asset_list.variable_mappings[asset_list.landlord_wall_construction]
+    }
+    new_heating_map = {
+        **HEATING_MAPPINGS, **asset_list.variable_mappings[asset_list.landlord_heating_system]
+    }
+    new_existing_pv_map = {
+        **EXISTING_PV_MAPPINGS, **asset_list.variable_mappings[asset_list.landlord_existing_pv]
+    }
+
     asset_list.apply_standardiation()
 
     # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"

From 63dbda005d63d590b1d2e1b156d15d125a67c746 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 07:57:47 +0000
Subject: [PATCH 45/72] completing full rename

---
 asset_list/AssetList.py | 51 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index b153b624..8379cc2a 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -218,8 +218,9 @@ class AssetList:
     STANDARD_ADDRESS_1 = "domna_address_1"
     STANDARD_POSTCODE = "domna_postcode"
     STANDARD_FULL_ADDRESS = "domna_full_address"
-    STANDARD_YEAR_BUILT = "domna_year_built"
+    STANDARD_YEAR_BUILT = "landlord_year_built"
     STANDARD_UPRN = "ordnance_survey_uprn"
+    STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id"
     STANDARD_PROPERTY_TYPE = "landlord_property_type"
     STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
     STANDARD_HEATING_SYSTEM = "landlord_heating_system"
@@ -293,6 +294,8 @@ class AssetList:
 
         self.variable_mappings = {}
 
+        self.rename_map = {}
+
     def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
 
         if method not in self.ADDRESS_1_CLEANING_METHODS:
@@ -359,6 +362,25 @@ class AssetList:
             # We look for string in the form (x-y)
             return bool(cls.MULTI_UNIT_REGEX.search(address1_section))
 
+    @staticmethod
+    def _convert_uprn(x):
+        """
+        Used to convert UPRNS to integer strings
+        :param x: uprn to convert
+        :return: converted uprn
+        """
+
+        if pd.isnull(x):
+            return x
+
+        # check if numeric
+        if np.isreal(x):
+            return str(int(x))
+
+        if str(x).isdigit():
+            return str(int(x))
+        return x
+
     def init_standardise(self):
         """
         This function is used to standardise the asset list
@@ -411,6 +433,12 @@ class AssetList:
         # We create the domna property id
         self.create_property_id()
 
+        # Clean up the UPRN column, if the landlord has provided them
+        if self.landlord_uprn is not None:
+            self.standardised_asset_list[self.landlord_uprn] = (
+                self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn)
+            )
+
         # We keep just the columns we care about and will work through the various columns and standardise
         variables = [
             self.landlord_property_id,
@@ -425,7 +453,21 @@ class AssetList:
             self.landlord_heating_system,
             self.landlord_existing_pv
         ]
-        rename = {}
+        # Keep just non-null variables (e.g landlord may not provide uprn
+        variables = [v for v in variables if v is not None]
+        rename = {
+            self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID,
+            self.address1_colname: self.STANDARD_ADDRESS_1,
+            self.postcode_colname: self.STANDARD_POSTCODE,
+            self.full_address_colname: self.STANDARD_FULL_ADDRESS,
+            self.landlord_uprn: self.STANDARD_UPRN,
+            self.landlord_property_type: self.STANDARD_PROPERTY_TYPE,
+            self.landlord_year_built: self.STANDARD_YEAR_BUILT,
+            self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION,
+            self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM,
+            self.landlord_existing_pv: self.STANDARD_EXISTING_PV
+        }
+        rename = {k: v for k, v in rename.items() if k is not None}
 
         if self.non_intrusives_present:
             variables += self.NON_INTRUSIVES_COLNAMES
@@ -538,5 +580,10 @@ class AssetList:
                 ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
             ]
 
+        # Apply renames to our standard names
+        self.standardised_asset_list = self.standardised_asset_list.rename(
+            columns=self.rename_map
+        )
+
     def create_lookup_mappings(self):
         pass

From 47ad0e8275ce218b0cd44de6342ff619d83a0d81 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 08:21:59 +0000
Subject: [PATCH 46/72] refactoring get_data methodology

---
 asset_list/AssetList.py          |  23 +++--
 etl/route_march_data_pull/app.py | 149 +++++++++----------------------
 2 files changed, 53 insertions(+), 119 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 8379cc2a..14dce093 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -295,6 +295,7 @@ class AssetList:
         self.variable_mappings = {}
 
         self.rename_map = {}
+        self.keep_variables = []
 
     def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
 
@@ -454,8 +455,8 @@ class AssetList:
             self.landlord_existing_pv
         ]
         # Keep just non-null variables (e.g landlord may not provide uprn
-        variables = [v for v in variables if v is not None]
-        rename = {
+        self.keep_variables = [v for v in variables if v is not None]
+        self.rename_map = {
             self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID,
             self.address1_colname: self.STANDARD_ADDRESS_1,
             self.postcode_colname: self.STANDARD_POSTCODE,
@@ -467,21 +468,17 @@ class AssetList:
             self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM,
             self.landlord_existing_pv: self.STANDARD_EXISTING_PV
         }
-        rename = {k: v for k, v in rename.items() if k is not None}
+        self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None}
 
         if self.non_intrusives_present:
-            variables += self.NON_INTRUSIVES_COLNAMES
-            rename = {
-                **rename,
+            self.keep_variables += self.NON_INTRUSIVES_COLNAMES
+            self.rename_map = {
+                **self.rename_map,
                 **dict(
                     zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES])
                 )
             }
 
-        self.standardised_asset_list = self.standardised_asset_list[variables].rename(
-            columns=rename
-        )
-
         # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y)
         self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[
             self.full_address_colname
@@ -498,10 +495,9 @@ class AssetList:
         )
 
         # Clear our build year column
-
         # We attempt to process the year built column
         if self.landlord_year_built is not None:
-            # We check if we have a datetime
+            # We check if we have a datetime - year built has not been renamed
             if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime):
                 # We treat any string columns - with common values we see
                 self.standardised_asset_list[self.landlord_year_built] = (
@@ -581,7 +577,8 @@ class AssetList:
             ]
 
         # Apply renames to our standard names
-        self.standardised_asset_list = self.standardised_asset_list.rename(
+        # Perform final variable selection and renaming:
+        self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename(
             columns=self.rename_map
         )
 
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index d520895d..83e5e0ca 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -1,10 +1,10 @@
 import os
 import time
-from BaseUtility import Definitions
+import json
 import pandas as pd
 import numpy as np
 from tqdm import tqdm
-from datetime import datetime
+from BaseUtility import Definitions
 from asset_list.AssetList import AssetList
 from asset_list.mappings.property_type import PROPERTY_MAPPING
 from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
@@ -31,8 +31,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
 def get_data(
-    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None,
-    epc_api_only=False
+    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map,
+    uprn_column=None, epc_api_only=False, row_id_name="row_id"
 ):
     epc_data = []
     errors = []
@@ -103,12 +103,12 @@ def get_data(
                 searcher.find_property(skip_os=True)
 
             if searcher.newest_epc is None:
-                no_epc.append(home["row_id"])
+                no_epc.append(home[row_id_name])
                 continue
 
             if epc_api_only:
                 epc = {
-                    "row_id": home["row_id"],
+                    row_id_name: home[row_id_name],
                     **searcher.newest_epc.copy()
                 }
 
@@ -144,7 +144,7 @@ def get_data(
             time.sleep(np.random.uniform(0.1, 1))
 
             epc = {
-                "row_id": home["row_id"],
+                row_id_name: home[row_id_name],
                 **searcher.newest_epc.copy(),
                 "recommendations": property_recommendations["rows"],
                 "find_my_epc_data": find_epc_data,
@@ -152,7 +152,7 @@ def get_data(
 
             epc_data.append(epc)
         except Exception as e:
-            errors.append(home["row_id"])
+            errors.append(home[row_id_name])
             time.sleep(5)
 
     return epc_data, errors, no_epc
@@ -402,113 +402,48 @@ def app():
     # # If we have the non-intrusives data, this should be true
     # HAS_NON_INTRUSIVES = True
 
-    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
-
-    if MISSING_POSTCODES_METHOD is not None:
-        if MISSING_POSTCODES_METHOD == "last_two_words":
-            # Replace any double spaces
-            asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)
-            asset_list["Postcode"] = np.where(
-                pd.isnull(asset_list["Postcode"]),
-                asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "),
-                asset_list["Postcode"]
-            )
-        else:
-            raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized")
-
-    asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
-    asset_list["row_id"] = asset_list.index
-
-    # We clean up portential non-breaking spaces, and double spaces
-    for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
-        asset_list[col] = asset_list[col].astype(str)
-        asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
-        asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)
-        asset_list[col] = asset_list[col].str.strip()
-
-    if ADDRESS1_COLUMN is None:
-        ADDRESS1_COLUMN = "address1_extracted"
-        asset_list = extract_address1(
-            asset_list=asset_list,
-            full_address_col=FULLADDRESS_COLUMN,
-            postcode_col=POSTCODE_COLUMN,
-            method=ADDRESS1_METHOD
-        )
-
-    if FULLADDRESS_COLUMN is None:
-        FULLADDRESS_COLUMN = "fulladdress_extracted"
-        # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
-        # Sometimes, some of the columns are empty, so we need to remove them
-        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(
-            lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1
-        )
-
-        # We clean up portential non-breaking spaces, and double spaces
-        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str)
-        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False)
-        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)
-
-    if UPRN_COLUMN is not None:
-        # Check if it's numeric and if so, make sure it's an integer
-        def convert_uprn(x):
-
-            if pd.isnull(x):
-                return x
-
-            # check if numeric
-            if np.isreal(x):
-                return str(int(x))
-
-            if str(x).isdigit():
-                return str(int(x))
-            return x
-
-        asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn)
-
-    # We attempt to process the year built column
-    if PROPERTY_YEAR_BUILT is not None:
-        # We check if we have a datetime
-        if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime):
-            # We treat any string columns - with common values we see
-            datetime_remap = {
-                "Pre 1900": datetime(year=1899, month=12, day=31),
-            }
-            asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap)
-
-            asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT])
-            # Convert this to year
-            asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year
-
-    # We check for duplicated addresses
-    asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
+    ### We retrieve the EPC data
 
     # We chunk up this data into 5000 rows at a time
     # Create the chunks directory
-    if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")):
-        os.makedirs(os.path.join(DATA_FOLDER, "Chunks"))
-    chunk_size = 5000
-    errors = []
-    no_epc = []
+    force_retrieve_data = False
     skip = None  # Used to skip already completed chunks
-    for i in range(0, len(asset_list), chunk_size):
+    chunk_size = 5000
+    filename = "Chunk {i}.csv"
+    download_folder = os.path.join(DATA_FOLDER, "Chunks")
+    if not os.path.exists(download_folder):
+        os.makedirs(download_folder)
+
+    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
+    downloaded_files = {filename.format(i=i) for i in chunk_indexes}
+
+    # We check if we have files associated to these files already and if we do, and we do not want to force the
+    # fetching of the data, we skip
+    folder_contents = os.listdir(download_folder)
+    if all(x in folder_contents for x in downloaded_files):
+        skip = max(chunk_indexes)
+
+    for i in range(0, len(asset_list.standardised_asset_list), chunk_size):
         print(f"Processing chunk {i} to {i + chunk_size}")
-        if skip is not None:
+        if skip is not None and not force_retrieve_data:
             if i <= skip:
                 continue
-        chunk = asset_list[i:i + chunk_size]
+        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
         epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
             asset_list=chunk,
-            fulladdress_column=FULLADDRESS_COLUMN,
-            address1_column=ADDRESS1_COLUMN,
-            postcode_column=POSTCODE_COLUMN,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            fulladdress_column=asset_list.STANDARD_FULL_ADDRESS,
+            address1_column=asset_list.STANDARD_ADDRESS_1,
+            postcode_column=asset_list.STANDARD_POSTCODE,
             manual_uprn_map=MANUAL_UPRN_MAP,
-            uprn_column=UPRN_COLUMN
+            uprn_column=asset_list.STANDARD_UPRN
         )
 
         # We now retrieve any failed properties
-        chunk_failed = chunk[chunk["row_id"].isin(errors)]
+        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
         epc_data_failed, _, _ = get_data(
             asset_list=chunk_failed,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
             fulladdress_column=FULLADDRESS_COLUMN,
             address1_column=ADDRESS1_COLUMN,
             postcode_column=POSTCODE_COLUMN,
@@ -517,20 +452,22 @@ def app():
         )
 
         epc_data_chunk.extend(epc_data_failed)
-        errors.extend(errors_chunk)
-        no_epc.extend(no_epc_chunk)
 
         # Append the failed data to the main data
         # Store the chunk locally as a csv
         pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
+        # Store the errors and no-data locally
+        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f:
+            json.dump(errors_chunk, f)
+
+        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
+            json.dump(no_epc_chunk, f)
 
     # We read in and concatenate the created created chunks
-    chunks_folder = os.path.join(DATA_FOLDER, "Chunks")
     # List the contents
-    chunk_files = os.listdir(chunks_folder)
     epc_data = []
-    for file in chunk_files:
-        csv_data = pd.read_csv(os.path.join(chunks_folder, file))
+    for file in downloaded_files:
+        csv_data = pd.read_csv(os.path.join(download_folder, file))
         # We need to convert the recommendations back to a list
         csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
         csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)

From 591ce5445839780ea64db5376eb0457d27da3d34 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 08:26:09 +0000
Subject: [PATCH 47/72] handling case where landlord uprn and landlord property
 id are the same

---
 asset_list/AssetList.py          | 6 ++++++
 etl/route_march_data_pull/app.py | 9 ++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 14dce093..5e8ff29c 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -297,6 +297,12 @@ class AssetList:
         self.rename_map = {}
         self.keep_variables = []
 
+        # Finally, we handle the case where the landlord's property ID is actually the OS UPRN
+        if self.landlord_uprn == self.landlord_property_id:
+            self.raw_asset_list[self.STANDARD_UPRN] = self.raw_asset_list[self.landlord_uprn].copy()
+            # Update the reference to landlord UPRn
+            self.landlord_uprn = self.STANDARD_UPRN
+
     def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
 
         if method not in self.ADDRESS_1_CLEANING_METHODS:
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 83e5e0ca..4bf9fe3a 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -511,6 +511,7 @@ def app():
         find_my_epc_data["Solar photovoltaics"] = False
 
     # Retrieve just the data we need
+
     epc_df = epc_df[
         [
             "row_id",
@@ -527,21 +528,23 @@ def app():
             "walls-description",
             "floor-description",
             "transaction-type",
-            # New fields needed
             "secondheat-description",
             "total-floor-area",
             "construction-age-band",
             "floor-height",
             "number-habitable-rooms",
             "mainheat-description",
-            #
-            "energy-consumption-current",  # kwh/m2
+            'mainheatcont-description',
+            "energy-consumption-current",
             "photo-supply",
         ]
     ].rename(
         columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
     )
 
+    asset_list.merge_data(epc_df)
+    asset_list.insert_
+
     asset_list = asset_list.merge(
         epc_df,
         how="left",

From 4a6802a5a24715ca0f047a70b680d6dc484cd7b4 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 08:27:35 +0000
Subject: [PATCH 48/72] fixed bug to reference standardised data when copying
 uprn instead of raw

---
 asset_list/AssetList.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 5e8ff29c..86b1bf87 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -299,7 +299,7 @@ class AssetList:
 
         # Finally, we handle the case where the landlord's property ID is actually the OS UPRN
         if self.landlord_uprn == self.landlord_property_id:
-            self.raw_asset_list[self.STANDARD_UPRN] = self.raw_asset_list[self.landlord_uprn].copy()
+            self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy()
             # Update the reference to landlord UPRn
             self.landlord_uprn = self.STANDARD_UPRN
 

From 37cc43adb1b331d267c724faaf804afaa0b7f2fc Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 08:39:29 +0000
Subject: [PATCH 49/72] refactoring creation of epc dataset

---
 asset_list/AssetList.py          | 42 +++++++++++++++++
 etl/route_march_data_pull/app.py | 77 +++++++-------------------------
 2 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 86b1bf87..88425e6d 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -202,6 +202,33 @@ class AssetList:
     This class is used to standardise asset lists so that we can process the core information in a consistent manner.
     """
 
+    EPC_API_DATA_NAMES = {
+        "uprn": "epc_os_uprn",
+        "address1": "epc_address1",
+        "address": "epc_address",
+        "postcode": "epc_postcode",
+        "inspection-date": "epc_inspection_date",
+        "current-energy-efficiency": "epc_sap_score_on_register",
+        "current-energy-rating": "epc_rating_on_register",
+        "property-type": "epc_property_type",
+        "built-form": "epc_archetype",
+        "total-floor-area": "epc_total_floor_area",
+        "construction-age-band": "epc_age_band",
+        "floor-height": "epc_floor_height",
+        "number-habitable-rooms": "epc_number_habitable_rooms",
+        "walls-description": "epc_wall_construction",
+        "roof-description": "epc_roof_construction",
+        "floor-description": "epc_floor_construction",
+        "mainheat-description": "epc_heating_type",
+        'mainheatcont-description': "epc_heating_controls",
+        "secondheat-description": "epc_secondary_heating",
+        "transaction-type": "epc_reason",
+        "energy-consumption-current": "epc_heat_demand",
+    }
+    FIND_EPC_DATA_NAMES = {
+
+    }
+
     DATETIME_REMAP = {
         "Pre 1900": datetime(year=1899, month=12, day=31),
     }
@@ -590,3 +617,18 @@ class AssetList:
 
     def create_lookup_mappings(self):
         pass
+
+    def merge_data(self, df: pd.DataFrame):
+        """
+        Used to insert data into the standardised asset list, based on the domna property id
+        :return:
+        """
+        if self.DOMNA_PROPERTY_ID not in df.columns:
+            raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}")
+
+        if df[self.DOMNA_PROPERTY_ID].duplicated().sum():
+            raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs")
+
+        self.standardised_asset_list = self.standardised_asset_list.merge(
+            df, how="left", on=self.DOMNA_PROPERTY_ID
+        )
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 4bf9fe3a..2e66c4aa 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -474,20 +474,22 @@ def app():
         epc_data.append(csv_data)
 
     epc_df = pd.concat(epc_data)
+    # TODO: TEMP!!!
+    epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID})
 
     # We expand out the recommendations
-    recommendations_df = epc_df[["row_id", "recommendations"]]
+    recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
 
     unique_recommendations = set()
     for _, row in recommendations_df.iterrows():
         unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
 
-    columns = ["row_id"] + list(unique_recommendations)
+    columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations)
     transformed_data = []
     for _, row in recommendations_df.iterrows():
         # Initialize a dictionary for this row with False for all recommendations
         row_data = {col: False for col in columns}
-        row_data["row_id"] = row["row_id"]
+        row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID]
 
         # Set True for each recommendation present in this row
         for rec in row["recommendations"]:
@@ -500,10 +502,11 @@ def app():
     transformed_df = pd.DataFrame(transformed_data)
     # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation
     # recommendations
-    transformed_df = transformed_df[["row_id", "Cavity wall insulation"]]
+    transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]]
 
     # Get the find my epc data
-    find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
+    find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop(
+        columns=["find_my_epc_data"]).join(
         pd.json_normalize(epc_df["find_my_epc_data"])
     )
     # We check if we get the solar pv column:
@@ -513,46 +516,15 @@ def app():
     # Retrieve just the data we need
 
     epc_df = epc_df[
-        [
-            "row_id",
-            "uprn",
-            "address1",
-            "address",
-            "postcode",
-            "property-type",
-            "built-form",
-            "inspection-date",
-            "current-energy-rating",
-            "current-energy-efficiency",
-            "roof-description",
-            "walls-description",
-            "floor-description",
-            "transaction-type",
-            "secondheat-description",
-            "total-floor-area",
-            "construction-age-band",
-            "floor-height",
-            "number-habitable-rooms",
-            "mainheat-description",
-            'mainheatcont-description',
-            "energy-consumption-current",
-            "photo-supply",
-        ]
-    ].rename(
-        columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
+        [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
+        ].rename(
+        columns=asset_list.EPC_API_DATA_NAMES
     )
 
-    asset_list.merge_data(epc_df)
-    asset_list.insert_
-
-    asset_list = asset_list.merge(
-        epc_df,
-        how="left",
-        on="row_id"
-    ).merge(
+    epc_df = epc_df.merge(
         find_my_epc_data[
             [
-                "row_id", "heating_text", "hot_water_text", 'Assessor’s name',
+                asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name',
                 "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
                 "Assessor’s ID", "Solar photovoltaics"
             ]
@@ -564,31 +536,16 @@ def app():
             }
         ),
         how="left",
-        on="row_id"
+        on=asset_list.DOMNA_PROPERTY_ID
     )
 
+    asset_list.merge_data(epc_df)
+
     asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
     asset_list = asset_list.drop(columns=["photo-supply"])
 
     # Rename the columns
-    asset_list = asset_list.rename(columns={
-        "inspection-date": "Date of last EPC",
-        "current-energy-efficiency": "SAP score on register",
-        "current-energy-rating": "EPC rating on register",
-        "property-type": "Property Type",
-        "built-form": "Archetype - EPC",
-        "total-floor-area": "Property Floor Area",
-        "construction-age-band": "Property Age Band",
-        "floor-height": "Property Floor Height",
-        "number-habitable-rooms": "Number of Habitable Rooms",
-        "walls-description": "Wall Construction",
-        "roof-description": "Roof Construction",
-        "floor-description": "Floor Construction",
-        "mainheat-description": "Heating Type",
-        "secondheat-description": "Secondary Heating",
-        "transaction-type": "Reason for last EPC",
-        "energy-consumption-current": "Heat Demand (kWh/m2)",
-    })
+    asset_list = asset_list
 
     asset_list["Estimated Number of Floors"] = asset_list.apply(
         lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(

From ecc9d9954073858685ef1877d574fc5fc73606b2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 08:45:15 +0000
Subject: [PATCH 50/72] major refactor of handling of epc data and starting to
 set up extract_attributes

---
 asset_list/AssetList.py          | 23 ++++++++++++++++++-----
 etl/route_march_data_pull/app.py | 17 ++++-------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 88425e6d..4ca4c2b8 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -226,7 +226,14 @@ class AssetList:
         "energy-consumption-current": "epc_heat_demand",
     }
     FIND_EPC_DATA_NAMES = {
-
+        "heating_text": "epc_estiamted_heating_kwh",
+        "hot_water_text": "epc_estimated_hotwater_kwh",
+        'Assessor’s name': "epc_assessor_name",
+        "Assessor's Telephone": "epc_assessor_telephone",
+        "Assessor's Email": "epc_assessor_email",
+        "Accreditation scheme": "epc_assessor_accreditation",
+        "Assessor’s ID": "epc_assessor_id",
+        "Solar photovoltaics": "epc_solar_pv"
     }
 
     DATETIME_REMAP = {
@@ -265,7 +272,8 @@ class AssetList:
         "Any further surveyor notes", 'Surveyors Name'
     ]
 
-    #### Mapping for wall construction
+    # Attributes - these are columns that we produce, calcualted based on other pieces of data
+    ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
 
     def __init__(
         self,
@@ -615,9 +623,6 @@ class AssetList:
             columns=self.rename_map
         )
 
-    def create_lookup_mappings(self):
-        pass
-
     def merge_data(self, df: pd.DataFrame):
         """
         Used to insert data into the standardised asset list, based on the domna property id
@@ -632,3 +637,11 @@ class AssetList:
         self.standardised_asset_list = self.standardised_asset_list.merge(
             df, how="left", on=self.DOMNA_PROPERTY_ID
         )
+
+    def extract_attributes(self):
+        # Used to extracty the typical attributes that we use to identify viable work
+
+        self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = (
+            self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] |
+            ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""])
+        )
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 2e66c4aa..8b112ea2 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -522,25 +522,16 @@ def app():
     )
 
     epc_df = epc_df.merge(
-        find_my_epc_data[
-            [
-                asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessor’s name',
-                "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
-                "Assessor’s ID", "Solar photovoltaics"
-            ]
-        ].rename(
-            columns={
-                "Solar photovoltaics": "Has Solar PV",
-                "heating_text": "Heating Estimated kWh",
-                "hot_water_text": "Hot Water Estimated kWh",
-            }
-        ),
+        find_my_epc_data[[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.FIND_EPC_DATA_NAMES.keys())]
+        .rename(columns=asset_list.FIND_EPC_DATA_NAMES),
         how="left",
         on=asset_list.DOMNA_PROPERTY_ID
     )
 
     asset_list.merge_data(epc_df)
 
+    asset_list.extract_attributes()
+
     asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
     asset_list = asset_list.drop(columns=["photo-supply"])
 

From ed333e1714fa9ff3a4f09bc789e5aa37bca0bc8e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 09:04:26 +0000
Subject: [PATCH 51/72] refactored est no floors

---
 asset_list/AssetList.py                 | 27 +++++++++++++++++++++++++
 etl/route_march_data_pull/app.py        | 12 +++++------
 recommendations/recommendation_utils.py |  7 +++++--
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 4ca4c2b8..74469c63 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -15,6 +15,12 @@ import asset_list.mappings.walls as walls_mappings
 import asset_list.mappings.heating_systems as heating_mappings
 import asset_list.mappings.exising_pv as existing_pv_mappings
 
+from recommendations.recommendation_utils import (
+    estimate_perimeter,
+    estimate_external_wall_area,
+    estimate_number_of_floors
+)
+
 logger = setup_logger()
 
 # OpenAI API Key (set this in your environment variables for security)
@@ -224,6 +230,7 @@ class AssetList:
         "secondheat-description": "epc_secondary_heating",
         "transaction-type": "epc_reason",
         "energy-consumption-current": "epc_heat_demand",
+        "photo-supply": "epc_photo_supply"
     }
     FIND_EPC_DATA_NAMES = {
         "heating_text": "epc_estiamted_heating_kwh",
@@ -274,6 +281,7 @@ class AssetList:
 
     # Attributes - these are columns that we produce, calcualted based on other pieces of data
     ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
+    ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors"
 
     def __init__(
         self,
@@ -645,3 +653,22 @@ class AssetList:
             self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] |
             ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""])
         )
+
+        accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"]
+
+        # The logic here is:
+        # 1) Take the property type provided by the HA themselves
+        # 2) In absence of that, take the EPC property type
+        # 3) Otherwise use None
+        self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply(
+            lambda x: estimate_number_of_floors(
+                property_type=(
+                    x[self.STANDARD_PROPERTY_TYPE].title() if
+                    x[self.STANDARD_PROPERTY_TYPE].title() in accepted_epc_property_types else (
+                        x[self.EPC_API_DATA_NAMES["property-type"]] if not
+                        pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None
+                    )
+                )
+            ),
+            axis=1
+        )
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 8b112ea2..9754e726 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -514,7 +514,6 @@ def app():
         find_my_epc_data["Solar photovoltaics"] = False
 
     # Retrieve just the data we need
-
     epc_df = epc_df[
         [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
         ].rename(
@@ -529,15 +528,14 @@ def app():
     )
 
     asset_list.merge_data(epc_df)
+    # TODO: TEMP!!!
+    epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str)
+    asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge(
+        epc_df, how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn"
+    )
 
     asset_list.extract_attributes()
 
-    asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
-    asset_list = asset_list.drop(columns=["photo-supply"])
-
-    # Rename the columns
-    asset_list = asset_list
-
     asset_list["Estimated Number of Floors"] = asset_list.apply(
         lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
             x["Property Type"]) else None, axis=1
diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py
index 00da6107..602684cf 100644
--- a/recommendations/recommendation_utils.py
+++ b/recommendations/recommendation_utils.py
@@ -205,7 +205,7 @@ def get_wall_u_value(
 
         mapped_value = wall_uvalues_df[
             wall_uvalues_df["Wall_type"] == mapped_description
-        ][age_band].values[0]
+            ][age_band].values[0]
 
         if pd.isnull(mapped_value) and "Park home" in mapped_description:
             # We don't know enough in this case so we default to 0
@@ -428,6 +428,9 @@ def estimate_number_of_floors(property_type):
     Using the property type, we estimate the number of floors in the property
     """
 
+    if property_type is None:
+        return None
+
     if property_type == "House":
         number_of_floors = 2
     elif property_type in ["Flat", "Bungalow"]:
@@ -560,7 +563,7 @@ def get_floor_u_value(
         insulation_lookup = s11[
             s11["Age_band"].str.contains(age_band) & s11["Floor_construction"]
             == floor_type
-        ]
+            ]
         if insulation_lookup.empty:
             insulation_thickness = 0
         else:

From 8bf6aa5af23378c0a1a27f6f756f3440d89b6bc4 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 09:20:25 +0000
Subject: [PATCH 52/72] refactoring construction of the attributes

---
 asset_list/AssetList.py | 65 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 74469c63..5f4436b8 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -21,6 +21,8 @@ from recommendations.recommendation_utils import (
     estimate_number_of_floors
 )
 
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
 logger = setup_logger()
 
 # OpenAI API Key (set this in your environment variables for security)
@@ -279,9 +281,19 @@ class AssetList:
         "Any further surveyor notes", 'Surveyors Name'
     ]
 
+    # This SAP threshold is a key search criteria for properties that may be eligible for extraction
+    SAP_RATING_THRESHOLD = 75
+    # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable
+    EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5
+
     # Attributes - these are columns that we produce, calcualted based on other pieces of data
     ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
     ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors"
+    ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter"
+    ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
+    ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
+    ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below"
+    ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}"
 
     def __init__(
         self,
@@ -672,3 +684,56 @@ class AssetList:
             ),
             axis=1
         )
+
+        self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = (
+            self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float)
+        )
+        # Replace "" value with None
+        self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = (
+            self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None)
+        )
+        self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = (
+            self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float)
+        )
+
+        # Estimate the perimeter
+        self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply(
+            lambda x: estimate_perimeter(
+                floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+                num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+            ), axis=1
+        )
+
+        self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply(
+            lambda x: estimate_external_wall_area(
+                num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+                floor_height=(
+                    float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if
+                    x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5
+                ),
+                perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER],
+                built_form=x[self.EPC_API_DATA_NAMES["built-form"]]
+            ),
+            axis=1
+        )
+
+        self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply(
+            lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[
+                "insulation_thickness"] if not pd.isnull(
+                x[self.EPC_API_DATA_NAMES["roof-description"]]) else None,
+            axis=1
+        )
+
+        # We produce some additional fields
+        # 1) Is the SAP rating below C75
+        self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
+            self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <=
+            self.SAP_RATING_THRESHOLD
+        )
+        # 2) Flag anything where the EPC is older than 5 years
+
+        self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = (
+            pd.to_datetime(
+                self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]]
+            ).dt.year < self.EPC_YEAR_THRESHOLD
+        )

From c0ebffb6cbab5d4f4e2d24f82f352cb8b7024638 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 20 Feb 2025 20:50:05 +0000
Subject: [PATCH 53/72] coding up logic to identify work types

---
 asset_list/AssetList.py                | 250 ++++++++++++++++++++++++-
 asset_list/mappings/heating_systems.py |   1 +
 asset_list/mappings/walls.py           |  27 ++-
 etl/route_march_data_pull/app.py       | 164 +---------------
 4 files changed, 270 insertions(+), 172 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 5f4436b8..81aa525a 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -10,6 +10,7 @@ import pandas as pd
 from fuzzywuzzy import process
 from utils.logger import setup_logger
 from backend.SearchEpc import SearchEpc
+from BaseUtility import Definitions
 import asset_list.mappings.property_type as property_type_mappings
 import asset_list.mappings.walls as walls_mappings
 import asset_list.mappings.heating_systems as heating_mappings
@@ -282,7 +283,9 @@ class AssetList:
     ]
 
     # This SAP threshold is a key search criteria for properties that may be eligible for extraction
-    SAP_RATING_THRESHOLD = 75
+    FILLED_CAVITY_SAP_THRESHOLD = 75
+    # This SAP the
+    EMPTY_CAVITY_SAP_THRESHOLD = 71
     # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable
     EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5
 
@@ -292,9 +295,17 @@ class AssetList:
     ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter"
     ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
     ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
-    ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{SAP_RATING_THRESHOLD}_and_below"
+    ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below"
     ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}"
 
+    # These are the descriptions that we look for in the EPC data that are indicative of no insulation
+    EPC_NO_WALL_INSULATION_DESCRIPTIONS = [
+        "cavity wall, as built, no insulation (assumed)",
+        "cavity wall, as built, partial insulation (assumed)",
+        "cavity wall, as built, partial insulation",
+        "cavity wall, as built, no insulation",
+    ]
+
     def __init__(
         self,
         local_filepath,
@@ -728,12 +739,241 @@ class AssetList:
         # 1) Is the SAP rating below C75
         self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
             self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <=
-            self.SAP_RATING_THRESHOLD
+            self.FILLED_CAVITY_SAP_THRESHOLD
         )
         # 2) Flag anything where the EPC is older than 5 years
-
         self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = (
             pd.to_datetime(
-                self.standardised_asset_list[self.EPC_API_DATA_NAMES["lodgement-date"]]
+                self.standardised_asset_list[self.EPC_API_DATA_NAMES["inspection-date"]]
             ).dt.year < self.EPC_YEAR_THRESHOLD
         )
+
+        self.process_age_band()
+
+    def process_age_band(self):
+        processed_age_band = []
+        for _, x in self.standardised_asset_list.iterrows():
+
+            if pd.isnull(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) or (
+                x[self.EPC_API_DATA_NAMES["construction-age-band"]] in Definitions.DATA_ANOMALY_MATCHES
+            ):
+                processed_age_band.append(
+                    {
+                        self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
+                        "epc_year_lower_bound": None,
+                        "epc_year_upper_bound": None,
+                        "Does Age Match EPC Age Band?": "No EPC Age Band"
+                    }
+                )
+                continue
+
+            # We exatract the upper and lower bounds
+            if x[self.EPC_API_DATA_NAMES["construction-age-band"]] in [
+                "England and Wales: 2007 onwards", "England and Wales: 2012 onwards"
+            ]:
+                year_lower_bound = 2007 if x[self.EPC_API_DATA_NAMES[
+                    "construction-age-band"]] == "England and Wales: 2007 onwards" else 2012
+
+                if pd.isnull(x[self.STANDARD_YEAR_BUILT]):
+                    age_band_matches = "No Year Built From Landlord"
+                else:
+                    age_band_matches = (
+                        "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] >= year_lower_bound
+                        else "EPC Age Band is older than Year Built"
+                    )
+
+                processed_age_band.append(
+                    {
+                        self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
+                        "epc_year_lower_bound": year_lower_bound,
+                        "epc_year_upper_bound": None,
+                        "Does Age Match EPC Age Band?": age_band_matches
+                    }
+                )
+                continue
+
+            if x[self.EPC_API_DATA_NAMES["construction-age-band"]] == "England and Wales: before 1900":
+
+                if pd.isnull(x[self.STANDARD_YEAR_BUILT]):
+                    age_band_matches = "No Year Built From Landlord"
+                else:
+                    age_band_matches = (
+                        "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] < 1900
+                        else "EPC Age Band is newer than Year Built"
+                    )
+
+                processed_age_band.append(
+                    {
+                        self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
+                        "epc_year_lower_bound": None,
+                        "epc_year_upper_bound": 1899,
+                        "Does Age Match EPC Age Band?": age_band_matches
+                    }
+                )
+                continue
+
+            if x[self.EPC_API_DATA_NAMES["construction-age-band"]].isdigit():
+
+                if pd.isnull(x[self.STANDARD_YEAR_BUILT]):
+                    age_band_matches = "No Year Built From Landlord"
+                else:
+                    age_band_matches = (
+                        "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] == int(
+                            x[self.EPC_API_DATA_NAMES["construction-age-band"]]
+                        )
+                        else "EPC Age Band is different from Year Built"
+                    )
+
+                processed_age_band.append(
+                    {
+                        self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
+                        "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]),
+                        "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]),
+                        "Does Age Match EPC Age Band?": age_band_matches
+                    }
+                )
+                continue
+
+            # Oherwise, we extract the upper and lower bounds
+            age_band = x[self.EPC_API_DATA_NAMES["construction-age-band"]].split(": ")[1]
+            lower_date, upper_date = age_band.split("-")
+
+            age_band_matches = (
+                "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) and (
+                    x[self.STANDARD_YEAR_BUILT] <= float(upper_date)
+                )
+                else "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date)
+                else "EPC Age Band is newer than Year Built"
+            )
+
+            processed_age_band.append(
+                {
+                    self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
+                    "epc_year_lower_bound": int(lower_date),
+                    "epc_year_upper_bound": int(upper_date),
+                    "Does Age Match EPC Age Band?": age_band_matches
+                }
+            )
+
+        processed_age_band = pd.DataFrame(processed_age_band)
+
+        self.standardised_asset_list = self.standardised_asset_list.merge(
+            processed_age_band, how="left"
+        )
+
+    def identify_worktypes(self):
+
+        # If we have non-intrusives completed, we can use this to identify work types
+
+        if self.non_intrusives_present:
+            ######################################################
+            # Empty cavity:
+            ######################################################
+            # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled
+            # 2) The age is before 1995
+            # TODO: 3) Remove anything that likley has access issues
+            self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = (
+                (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
+                (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
+                self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) &
+                (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000)
+            )
+
+            self.standardised_asset_list["epc_indicates_empty_cavity"] = (
+                self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin(
+                    self.EPC_NO_WALL_INSULATION_DESCRIPTIONS
+                ) & (
+                    self.standardised_asset_list["epc_year_upper_bound"] <= 1995
+                ) & (
+                    ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD]
+                ) & (
+                    self.standardised_asset_list[
+                        self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD
+                )
+            )
+
+            ######################################################
+            # Extraction
+            ######################################################
+
+            # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged
+            # as needing a CIGA check. What is the logic we should be applying here?
+            self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
+                (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
+                (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
+                (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"])
+                 ) & (
+                    self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
+                )
+            )
+
+            ######################################################
+            # Solar
+            ######################################################
+            # Criteria:
+
+            # TODO: Standardise these columns with our cleaned_data object
+
+            # Check 1: Does the property have a valid heating system?
+            self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = (
+                self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin(
+                    ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"]
+                )
+            )
+
+            self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = (
+                (
+                    self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]]
+                    .str.lower().str.contains("air source heat pump|ground source heat pump")
+                ) | (
+                    self.standardised_asset_list[
+                        self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains(
+                        "electric storage heaters"
+                    ) & (
+                        self.standardised_asset_list[self.EPC_API_DATA_NAMES[
+                            "mainheatcont-description"]] == "Controls for high heat retention storage heaters"
+                    )
+                )
+            )
+
+            # Check 2: Does the property have solar already
+            self.standardised_asset_list["property_has_solar"] = (
+                (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") |
+                (self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") |
+                (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR])
+            )
+
+            # Check 3: Does the property meet the fabric condition
+            # Solar PV installs are subject to the minimum insulation requirements which means:
+            # 1) one of the following insulation measures must be installed as part of the same
+            # ECO4 project:
+            # • roof insulation (flat roof, pitched roof, room-in-roof)
+            # • exterior facing wall insulation (cavity wall, solid wall)
+            # • party cavity wall insulation
+            # • floor insulation (solid and underfloor)
+            #
+            # OR
+            #
+            # all measures (except any exempted measure referred to in paragraph 4.28)
+            # listed in paragraph a) must already be installed
+            #
+            # With this in mind, we look for 2 clases
+            # 1) The property is fully insulated apart from the loft (<200mm insulation)
+            # 2) THe property is fully insulated
+
+            self.standardised_asset_list["solar_landlord_walls_insulated"] = (
+                self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(
+                    ["filled cavity", "insulated solid brick"]
+                )
+            )
+
+            EPC_INSULATED_WALLS_SUBSTRINGS = [
+                ", insulated", "with external insulation", "with internal insulation", "filled cavity"
+            ]
+
+            self.standardised_asset_list["landlord_wall_construction"].value_counts()
+
+            EPC_INSULATED_ROOF_SUBSTRINGS = [
+                "(another dwelling above)", "limited insulation", "(other premises above)",
+                ", no insulation",
+            ]
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
index 2fbdff70..89bfe0c4 100644
--- a/asset_list/mappings/heating_systems.py
+++ b/asset_list/mappings/heating_systems.py
@@ -13,6 +13,7 @@ STANDARD_HEATING_SYSTEMS = {
     "electric boiler",
     "unknown",
     "communal gas boiler",
+    "high heat retention storage heaters",
 }
 
 HEATING_MAPPINGS = {
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
index 33db1fef..c5cca599 100644
--- a/asset_list/mappings/walls.py
+++ b/asset_list/mappings/walls.py
@@ -1,8 +1,10 @@
 from asset_list.AssetList import DataRemapper
 
 STANDARD_WALL_CONSTRUCTIONS = {
-    "uninsulated cavity", "filled cavity", "partial insulated cavity", "timber frame", "solid brick",
-    "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", "cob",
+    "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation",
+    "timber frame", "uninsulated solid brick",
+    "insulated solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone",
+    "cob",
     "new build - average thermal transmittance",
 }
 
@@ -26,7 +28,8 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown',
     'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown',
     'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
-    'Average thermal transmittance 0.33 W/m?K': 'unknown', 'Cavity wall,': 'unknown',
+    'Average thermal transmittance 0.33 W/m?K': 'unknown',
+    'Cavity wall,': "cavity unknown insulation",
     'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
     'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown',
     'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown',
@@ -55,7 +58,7 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown',
     'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown',
     'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
-    'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': 'unknown',
+    'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': "cavity unknown insulation",
     'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
     'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown',
     'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown',
@@ -67,4 +70,20 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown',
     'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown',
     'average thermal transmittance 0.28 w/m?k': 'unknown',
+    'Cavity wall, filled cavity': 'filled cavity',
+    'Cavity wall, filled cavity and external insulation': 'filled cavity',
+    'Granite or whinstone, as built, no insulation (assumed)': 'granite or '
+                                                               'whinstone',
+    'Solid brick, as built, insulated (assumed)': 'insulated solid brick',
+    'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick',
+    'Solid brick, with external insulation': 'insulated solid brick',
+    'Solid brick, with internal insulation': 'insulated solid brick',
+    'System built, as built, insulated (assumed)': 'system built',
+    'System built, as built, no insulation (assumed)': 'system built',
+    'System built, with external insulation': 'system built',
+    'System built, with internal insulation': 'system built',
+    'Timber frame, as built, insulated (assumed)': 'timber frame',
+    'Timber frame, as built, no insulation (assumed)': 'timber frame',
+    'Timber frame, as built, partial insulation (assumed)': 'timber frame',
+    'Timber frame, with additional insulation': 'timber frame',
 }
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 9754e726..fbf7e10d 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -4,7 +4,6 @@ import json
 import pandas as pd
 import numpy as np
 from tqdm import tqdm
-from BaseUtility import Definitions
 from asset_list.AssetList import AssetList
 from asset_list.mappings.property_type import PROPERTY_MAPPING
 from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
@@ -14,13 +13,6 @@ from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
 from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
-from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
-
-from recommendations.recommendation_utils import (
-    estimate_perimeter,
-    estimate_external_wall_area,
-    estimate_number_of_floors
-)
 
 from etl.epc_clean.epc_attributes.attribute_utils import (
     extract_thermal_transmittance
@@ -177,109 +169,6 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t
     raise ValueError(f"Method {method} not recognized")
 
 
-def process_age_band(asset_list, year_built_column):
-    processed_age_band = []
-    for _, x in asset_list.iterrows():
-
-        if pd.isnull(x["Property Age Band"]) or (
-            x["Property Age Band"] in Definitions.DATA_ANOMALY_MATCHES
-        ):
-            processed_age_band.append({
-                "row_id": x["row_id"],
-                "epc_year_lower_bound": None,
-                "epc_year_upper_bound": None,
-                "Does Age Match EPC Age Band?": "No EPC Age Band"
-            })
-            continue
-
-        # We exatract the upper and lower bounds
-        if x["Property Age Band"] in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]:
-            year_lower_bound = 2007 if x["Property Age Band"] == "England and Wales: 2007 onwards" else 2012
-
-            if pd.isnull(x[year_built_column]):
-                age_band_matches = "No Year Built From Landlord"
-            else:
-                age_band_matches = (
-                    "EPC Age Band Matches Year Built" if x[year_built_column] >= year_lower_bound
-                    else "EPC Age Band is older than Year Built"
-                )
-
-            processed_age_band.append(
-                {
-                    "row_id": x["row_id"],
-                    "epc_year_lower_bound": year_lower_bound,
-                    "epc_year_upper_bound": None,
-                    "Does Age Match EPC Age Band?": age_band_matches
-                }
-            )
-            continue
-
-        if x["Property Age Band"] == "England and Wales: before 1900":
-
-            if pd.isnull(x[year_built_column]):
-                age_band_matches = "No Year Built From Landlord"
-            else:
-                age_band_matches = (
-                    "EPC Age Band Matches Year Built" if x[year_built_column] < 1900
-                    else "EPC Age Band is newer than Year Built"
-                )
-
-            processed_age_band.append(
-                {
-                    "row_id": x["row_id"],
-                    "epc_year_lower_bound": None,
-                    "epc_year_upper_bound": 1899,
-                    "Does Age Match EPC Age Band?": age_band_matches
-                }
-            )
-            continue
-
-        if x["Property Age Band"].isdigit():
-
-            if pd.isnull(x[year_built_column]):
-                age_band_matches = "No Year Built From Landlord"
-            else:
-                age_band_matches = (
-                    "EPC Age Band Matches Year Built" if x[year_built_column] == int(x["Property Age Band"])
-                    else "EPC Age Band is different from Year Built"
-                )
-
-            processed_age_band.append(
-                {
-                    "row_id": x["row_id"],
-                    "epc_year_lower_bound": int(x["Property Age Band"]),
-                    "epc_year_upper_bound": int(x["Property Age Band"]),
-                    "Does Age Match EPC Age Band?": age_band_matches
-                }
-            )
-            continue
-
-        # Oherwise, we extract the upper and lower bounds
-        age_band = x["Property Age Band"].split(": ")[1]
-        lower_date, upper_date = age_band.split("-")
-
-        age_band_matches = (
-            "EPC Age Band Matches Year Built" if (x[year_built_column] >= float(lower_date)) and (
-                x[year_built_column] <= float(upper_date)
-            )
-            else "EPC Age Band is older than Year Built" if x[year_built_column] > float(upper_date)
-            else "EPC Age Band is newer than Year Built"
-        )
-
-        processed_age_band.append(
-            {
-                "row_id": x["row_id"],
-                "epc_year_lower_bound": int(lower_date),
-                "epc_year_upper_bound": int(upper_date),
-                "Does Age Match EPC Age Band?": age_band_matches
-            }
-        )
-
-    processed_age_band = pd.DataFrame(processed_age_band)
-
-    return processed_age_band
-
-
 def app():
     """
     This app is EPC pulling data for some properties owned by Livewest
@@ -531,62 +420,11 @@ def app():
     # TODO: TEMP!!!
     epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str)
     asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge(
-        epc_df, how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn"
+        epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn"
     )
 
     asset_list.extract_attributes()
 
-    asset_list["Estimated Number of Floors"] = asset_list.apply(
-        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
-            x["Property Type"]) else None, axis=1
-    )
-
-    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
-    # Replace "" value with None
-    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
-    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
-
-    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
-        lambda x: estimate_perimeter(
-            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
-            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
-        ), axis=1
-    )
-
-    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
-        lambda x: estimate_external_wall_area(
-            num_floors=x["Estimated Number of Floors"],
-            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
-            perimeter=x["Estimated Perimeter (m)"],
-            built_form=x["Archetype - EPC"]
-        ),
-        axis=1
-    )
-
-    asset_list["Roof Insulation Thickness"] = asset_list.apply(
-        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
-            x["Roof Construction"]) else None,
-        axis=1
-    )
-
-    # We produce some additional fields
-    # 1) Is the SAP rating below C75
-    asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75
-    # 2) Flag anything where the EPC is older than 5 years
-    cutoff_year = pd.Timestamp.now().year - 5
-    asset_list[f"EPC is pre {cutoff_year}"] = (
-        pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year
-    )
-
-    # 3) If we have year in the asset list, we flag entries where the built year is different from the
-    # EPC Age band
-    if PROPERTY_YEAR_BUILT is not None:
-        # We process the age band and merge it on
-        processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT)
-        asset_list = asset_list.merge(
-            processed_age_band, how="left", on="row_id"
-        )
-
     if HAS_NON_INTRUSIVES:
         # Empty cavity:
         # 1) Has been flagged on the non-intrusives as being empty or partially filled

From 4db9d48e366e121abcfe83e2dfd335d33151bc68 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 12:39:06 +0000
Subject: [PATCH 54/72] adding the solar floor eligibility criteria

---
 asset_list/AssetList.py          | 85 ++++++++++++++++++++++++++++----
 asset_list/requirements.txt      |  3 +-
 etl/route_march_data_pull/app.py | 28 ++++++++++-
 3 files changed, 105 insertions(+), 11 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 81aa525a..4666cf63 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -306,6 +306,17 @@ class AssetList:
         "cavity wall, as built, no insulation",
     ]
 
+    # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated
+    EPC_INSULATED_WALLS_SUBSTRINGS = [
+        ", insulated", "with external insulation", "with internal insulation", "filled cavity"
+    ]
+
+    # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated
+    EPC_INSULATED_ROOF_SUBSTRINGS = [
+        "(another dwelling above)", ", insulated", ", insulated (assumed) ",
+        ", ceiling insulated",
+    ]
+
     def __init__(
         self,
         local_filepath,
@@ -861,7 +872,10 @@ class AssetList:
             processed_age_band, how="left"
         )
 
-    def identify_worktypes(self):
+    def identify_worktypes(self, cleaned):
+
+        if not self.non_intrusives_present:
+            raise NotImplementedError("Need to implement the case for non-intrusives")
 
         # If we have non-intrusives completed, we can use this to identify work types
 
@@ -892,6 +906,17 @@ class AssetList:
                 )
             )
 
+            self.standardised_asset_list["empty_cavity"] = (
+                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] |
+                self.standardised_asset_list["epc_indicates_empty_cavity"]
+            )
+            # We add a reason
+            self.standardised_asset_list["empty_cavity_reason"] = np.where(
+                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"],
+                "Non-Intrusive Data",
+                "EPC Data"
+            )
+
             ######################################################
             # Extraction
             ######################################################
@@ -967,13 +992,55 @@ class AssetList:
                 )
             )
 
-            EPC_INSULATED_WALLS_SUBSTRINGS = [
-                ", insulated", "with external insulation", "with internal insulation", "filled cavity"
-            ]
+            # TODO: We don't have information about the roof from this landlord
+            self.standardised_asset_list["solar_epc_walls_insulated"] = (
+                self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains(
+                    "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)
+                )
+            )
 
-            self.standardised_asset_list["landlord_wall_construction"].value_counts()
+            # We merge on the u-value for average thermal transmittance
+            roof_uvalue_data = pd.DataFrame(cleaned["roof-description"])
+            roof_uvalue_data = roof_uvalue_data[
+                ~pd.isnull(roof_uvalue_data["thermal_transmittance"])
+            ][["original_description", "thermal_transmittance"]].rename(
+                columns={
+                    "original_description": self.EPC_API_DATA_NAMES["roof-description"],
+                    "thermal_transmittance": "roof_u_value"
+                }
+            )
 
-            EPC_INSULATED_ROOF_SUBSTRINGS = [
-                "(another dwelling above)", "limited insulation", "(other premises above)",
-                ", no insulation",
-            ]
+            self.standardised_asset_list = self.standardised_asset_list.merge(
+                roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"]
+            )
+
+            # If the u-value of a roof is less than 0.7 we consider it insulated
+            self.standardised_asset_list["solar_epc_roof_insulated"] = (
+                self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains(
+                    "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False
+                ) | (
+                    self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
+                        lambda x: int(x) >= 270 if str(x).isdigit() else False
+                    )
+                ) | (
+                    self.standardised_asset_list["roof_u_value"].apply(
+                        lambda x: x <= 0.7 if not pd.isnull(x) else False
+                    )
+                )
+            )
+
+            self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[
+                self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
+                lambda x: int(x) < 270 if str(x).isdigit() else False
+            )
+
+            self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[
+                self.EPC_API_DATA_NAMES["floor-description"]
+            ].str.lower().str.contains("solid")
+            self.standardised_asset_list["solar_epc_floor_is_solid"] = (
+                self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False)
+            )
+
+            z = self.standardised_asset_list[
+                self.standardised_asset_list["solar_epc_floor_is_solid"] == True
+                ]
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
index 0c16c43a..fd045d46 100644
--- a/asset_list/requirements.txt
+++ b/asset_list/requirements.txt
@@ -7,4 +7,5 @@ fuzzywuzzy
 boto3
 openpyxl
 openai
-tiktoken
\ No newline at end of file
+tiktoken
+msgpack
\ No newline at end of file
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index fbf7e10d..32c36fe8 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -391,13 +391,28 @@ def app():
     transformed_df = pd.DataFrame(transformed_data)
     # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation
     # recommendations
-    transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]]
+    transformed_df = transformed_df[
+        [
+            asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation", "Floor insulation (solid floor)",
+            "Floor insulation", "Floor insulation (suspended floor)"
+        ]
+    ]
+
+    transformed_df["epc_has_floor_recommendation"] = (
+        transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] |
+        transformed_df["Floor insulation (suspended floor)"]
+    )
 
     # Get the find my epc data
     find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop(
         columns=["find_my_epc_data"]).join(
         pd.json_normalize(epc_df["find_my_epc_data"])
     )
+    find_my_epc_data = find_my_epc_data.merge(
+        transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]],
+        how="left", on=asset_list.DOMNA_PROPERTY_ID
+    )
+
     # We check if we get the solar pv column:
     if "Solar photovoltaics" not in find_my_epc_data.columns:
         find_my_epc_data["Solar photovoltaics"] = False
@@ -425,6 +440,17 @@ def app():
 
     asset_list.extract_attributes()
 
+    # TODO - Use this!
+    import msgpack
+    from utils.s3 import read_from_s3
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    asset_list.identify_worktypes(cleaned)
+
     if HAS_NON_INTRUSIVES:
         # Empty cavity:
         # 1) Has been flagged on the non-intrusives as being empty or partially filled

From c544c95282df3a9c50fc84ab46bd387f889a4b4d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 14:35:12 +0000
Subject: [PATCH 55/72] working on solar criteria

---
 asset_list/AssetList.py          | 105 +++++++++++++++++++++++++++----
 etl/route_march_data_pull/app.py |   8 +--
 2 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 4666cf63..056f8b5d 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -936,9 +936,6 @@ class AssetList:
             # Solar
             ######################################################
             # Criteria:
-
-            # TODO: Standardise these columns with our cleaned_data object
-
             # Check 1: Does the property have a valid heating system?
             self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = (
                 self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin(
@@ -993,9 +990,35 @@ class AssetList:
             )
 
             # TODO: We don't have information about the roof from this landlord
+
+            # We merge on the u-value for average thermal transmittance
+            walls_uvalue_data = pd.DataFrame(cleaned["walls-description"])
+            walls_uvalue_data = walls_uvalue_data[
+                ~pd.isnull(walls_uvalue_data["thermal_transmittance"])
+            ][["original_description", "thermal_transmittance"]].rename(
+                columns={
+                    "original_description": self.EPC_API_DATA_NAMES["walls-description"],
+                    "thermal_transmittance": "walls_u_value"
+                }
+            )
+            self.standardised_asset_list = self.standardised_asset_list.merge(
+                walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"]
+            )
+
             self.standardised_asset_list["solar_epc_walls_insulated"] = (
-                self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains(
-                    "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)
+                (
+                    self.standardised_asset_list[
+                        self.EPC_API_DATA_NAMES[
+                            "walls-description"]].str.lower().str.contains(
+                        "|".join(
+                            self.EPC_INSULATED_WALLS_SUBSTRINGS)
+                    )
+                ) | (
+                    self.standardised_asset_list[
+                        "walls_u_value"].apply(
+                        lambda x: x <= 0.3 if not pd.isnull(
+                            x) else False
+                    )
                 )
             )
 
@@ -1034,13 +1057,69 @@ class AssetList:
                 lambda x: int(x) < 270 if str(x).isdigit() else False
             )
 
-            self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[
-                self.EPC_API_DATA_NAMES["floor-description"]
-            ].str.lower().str.contains("solid")
-            self.standardised_asset_list["solar_epc_floor_is_solid"] = (
-                self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False)
+            # TODO: Fill with False - should be temp!
+            self.standardised_asset_list["epc_has_floor_recommendation"] = (
+                self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False)
             )
 
-            z = self.standardised_asset_list[
-                self.standardised_asset_list["solar_epc_floor_is_solid"] == True
-                ]
+            self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = (
+                (
+                    (
+                        self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str
+                        .lower().str.contains("solid")
+                    ) & (
+                        ~self.standardised_asset_list["epc_has_floor_recommendation"]
+                    )
+                ) | (
+                    (
+                        self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.contains("solid")
+                    ) & (
+                        self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower()
+                        .str.contains(", insulated")
+                    )
+                )
+            )
+
+            # We now put together the criteria:
+            # Flag properties that look eligible for solar, that have solid floors
+            # TODO: We'll need to revise this
+            self.standardised_asset_list["solar_eligible_solid_floor"] = (
+                # Landlord data or EPC data indicates the heating system is appropriate
+                (
+                    self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] |
+                    self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"]
+                ) &
+                # The property doesn't currently have solar
+                ~self.standardised_asset_list["property_has_solar"] &
+                # The walls are insulated
+                (
+                    self.standardised_asset_list["solar_landlord_walls_insulated"] |
+                    self.standardised_asset_list["solar_epc_walls_insulated"]
+                ) &
+                # Roof is insulated
+                self.standardised_asset_list["solar_epc_roof_insulated"] &
+                self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"]
+            )
+
+            # Solid floor but needs a loft top-up
+            self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = (
+                # Landlord data or EPC data indicates the heating system is appropriate
+                (
+                    self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] |
+                    self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"]
+                ) &
+                # The property doesn't currently have solar
+                ~self.standardised_asset_list["property_has_solar"] &
+                # The walls are insulated
+                (
+                    self.standardised_asset_list["solar_landlord_walls_insulated"] |
+                    self.standardised_asset_list["solar_epc_walls_insulated"]
+                ) &
+                # Roof is insulated
+                self.standardised_asset_list["solar_epc_loft_needs_topup"] &
+                self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"]
+            )
+
+            # Suspended floor, fully insulated
+
+            # ~self.standardised_asset_list["solar_epc_loft_needs_topup"] &
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 32c36fe8..0de85a27 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -389,11 +389,9 @@ def app():
         transformed_data.append(row_data)
 
     transformed_df = pd.DataFrame(transformed_data)
-    # At the moment, we're only using a limited set of columns - let's jut keep cavity wall insulation
-    # recommendations
     transformed_df = transformed_df[
         [
-            asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation", "Floor insulation (solid floor)",
+            asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)",
             "Floor insulation", "Floor insulation (suspended floor)"
         ]
     ]
@@ -425,7 +423,9 @@ def app():
     )
 
     epc_df = epc_df.merge(
-        find_my_epc_data[[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.FIND_EPC_DATA_NAMES.keys())]
+        find_my_epc_data[
+            [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys())
+            ]
         .rename(columns=asset_list.FIND_EPC_DATA_NAMES),
         how="left",
         on=asset_list.DOMNA_PROPERTY_ID

From 84ae26a9133e91a3f1904db2407f2f84bfb7305a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 15:14:18 +0000
Subject: [PATCH 56/72] added the eligibility criteria for solar and aggregate
 figures

---
 asset_list/AssetList.py          | 117 ++++++++++++++++++++++++++++++-
 etl/route_march_data_pull/app.py |   1 -
 2 files changed, 114 insertions(+), 4 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 056f8b5d..ffe53d40 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -341,6 +341,8 @@ class AssetList:
         # Read in the data
         self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
         self.standardised_asset_list = self.raw_asset_list.copy()
+        # Will be used to store aggregated figures against the various work types
+        self.work_type_figures = {}
 
         # We detect the presence of the non-intrusive columns
         self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
@@ -1062,6 +1064,23 @@ class AssetList:
                 self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False)
             )
 
+            # We merge on the u-value for average thermal transmittance
+            floors_uvalue_data = pd.DataFrame(cleaned["floor-description"])
+            floors_uvalue_data = floors_uvalue_data[
+                ~pd.isnull(floors_uvalue_data["thermal_transmittance"])
+            ][["original_description", "thermal_transmittance"]].rename(
+                columns={
+                    "original_description": self.EPC_API_DATA_NAMES["floor-description"],
+                    "thermal_transmittance": "floor_u_value"
+                }
+            )
+
+            # Merge on
+            self.standardised_asset_list = self.standardised_asset_list.merge(
+                floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"]
+            )
+
+            # We assume that a U-value of 0.5 or below is indicative of an insulated floor
             self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = (
                 (
                     (
@@ -1072,7 +1091,8 @@ class AssetList:
                     )
                 ) | (
                     (
-                        self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.contains("solid")
+                        self.standardised_asset_list[
+                            self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid")
                     ) & (
                         self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower()
                         .str.contains(", insulated")
@@ -1080,6 +1100,33 @@ class AssetList:
                 )
             )
 
+            # Check for other floor types, insulated
+            self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = (
+                # The floor is suspended and insulated
+                (
+                    (
+                        self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str
+                        .lower().str.contains("suspended")
+                    ) & (
+                        ~self.standardised_asset_list["epc_has_floor_recommendation"]
+                    )
+                ) | (
+                    (
+                        self.standardised_asset_list[
+                            self.EPC_API_DATA_NAMES["floor-description"]
+                        ].str.lower().str.contains("suspended")
+                    ) & (
+                        self.standardised_asset_list[
+                            self.EPC_API_DATA_NAMES["floor-description"]
+                        ].str.lower().str.contains(", insulated")
+                    )
+                ) | (
+                    self.standardised_asset_list["floor_u_value"].apply(
+                        lambda x: x <= 0.5 if not pd.isnull(x) else False
+                    )
+                )
+            )
+
             # We now put together the criteria:
             # Flag properties that look eligible for solar, that have solid floors
             # TODO: We'll need to revise this
@@ -1120,6 +1167,70 @@ class AssetList:
                 self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"]
             )
 
-            # Suspended floor, fully insulated
+            # Other floor type, fully insulated
+            self.standardised_asset_list["solar_eligible_other_floor"] = (
+                # Landlord data or EPC data indicates the heating system is appropriate
+                (
+                    self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] |
+                    self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"]
+                ) &
+                # The property doesn't currently have solar
+                ~self.standardised_asset_list["property_has_solar"] &
+                # The walls are insulated
+                (
+                    self.standardised_asset_list["solar_landlord_walls_insulated"] |
+                    self.standardised_asset_list["solar_epc_walls_insulated"]
+                ) &
+                # Roof is insulated
+                self.standardised_asset_list["solar_epc_roof_insulated"] &
+                self.standardised_asset_list["solar_epc_floor_is_other_insulated"]
+            )
 
-            # ~self.standardised_asset_list["solar_epc_loft_needs_topup"] &
+            # Other floor type, needs loft top-up
+            self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = (
+                # Landlord data or EPC data indicates the heating system is appropriate
+                (
+                    self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] |
+                    self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"]
+                ) &
+                # The property doesn't currently have solar
+                ~self.standardised_asset_list["property_has_solar"] &
+                # The walls are insulated
+                (
+                    self.standardised_asset_list["solar_landlord_walls_insulated"] |
+                    self.standardised_asset_list["solar_epc_walls_insulated"]
+                ) &
+                # Roof need loft top-up
+                self.standardised_asset_list["solar_epc_loft_needs_topup"] &
+                # Floor is not solid, but is insulated
+                self.standardised_asset_list["solar_epc_floor_is_other_insulated"]
+            )
+
+        # Produce some aggregate figures
+        self.work_type_figures = {
+            # Empty cavity from non-intrusives
+            "Empty Cavity (non-intrusives)": (
+                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum()
+            ),
+            "Empty Cavity (EPC)": (
+                (
+                    self.standardised_asset_list["epc_indicates_empty_cavity"] &
+                    ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]
+                ).sum()
+            ),
+            "Cavity Extraction": (
+                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"].sum()
+            ),
+            "Solar PV (Solid Floor)": (
+                self.standardised_asset_list["solar_eligible_solid_floor"].sum()
+            ),
+            "Solar PV (Solid Floor, Needs Loft Top-up)": (
+                self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"].sum()
+            ),
+            "Solar PV (Other Floor)": (
+                self.standardised_asset_list["solar_eligible_other_floor"].sum()
+            ),
+            "Solar PV (Other Floor, Needs Loft Top-up)": (
+                self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum()
+            )
+        }
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 0de85a27..5960f69b 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -440,7 +440,6 @@ def app():
 
     asset_list.extract_attributes()
 
-    # TODO - Use this!
     import msgpack
     from utils.s3 import read_from_s3
     cleaned = read_from_s3(

From 5df47a86ae889b4e26191550f79fc4720f2878a7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 15:15:58 +0000
Subject: [PATCH 57/72] removed circular import

---
 asset_list/mappings/walls.py     |   2 -
 etl/route_march_data_pull/app.py | 126 +------------------------------
 2 files changed, 3 insertions(+), 125 deletions(-)

diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
index c5cca599..1fc52fcb 100644
--- a/asset_list/mappings/walls.py
+++ b/asset_list/mappings/walls.py
@@ -1,5 +1,3 @@
-from asset_list.AssetList import DataRemapper
-
 STANDARD_WALL_CONSTRUCTIONS = {
     "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation",
     "timber frame", "uninsulated solid brick",
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 5960f69b..7bf3cca8 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -4,6 +4,8 @@ import json
 import pandas as pd
 import numpy as np
 from tqdm import tqdm
+import msgpack
+from utils.s3 import read_from_s3
 from asset_list.AssetList import AssetList
 from asset_list.mappings.property_type import PROPERTY_MAPPING
 from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
@@ -440,8 +442,6 @@ def app():
 
     asset_list.extract_attributes()
 
-    import msgpack
-    from utils.s3 import read_from_s3
     cleaned = read_from_s3(
         s3_file_name="cleaned_epc_data/cleaned.bson",
         bucket_name="retrofit-data-dev"
@@ -450,114 +450,7 @@ def app():
 
     asset_list.identify_worktypes(cleaned)
 
-    if HAS_NON_INTRUSIVES:
-        # Empty cavity:
-        # 1) Has been flagged on the non-intrusives as being empty or partially filled
-        # 2) The age is before 1995
-        # 3) Remove anything that likley has access issues
-        asset_list["Suitable for Cavity Fill"] = (
-            (asset_list["Construction"] == "CAVITY") &
-            asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
-            (
-                # Shold we defer to the year built provided by the HA?
-                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995)
-            ) &
-            (
-                # We check if the property type column contains one of the invalid property types
-                ~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary))
-            )
-        )
-
-        # asset_list["Suitable for Extraction"] =
-        asset_list[
-            (asset_list["Construction"] == "Cavity") &
-            asset_list["Insulated"].isin(["RETRO DRILLED"]) &
-            (
-                (asset_list[PROPERTY_YEAR_BUILT] <= 1995)
-            ) &
-            (
-                asset_list[PROPERTY_TYPE_COLUMN]
-            )
-            ]
-
-    # 4) Flag properties that look like they're good candidates for solar installs
-    # Firstly, flag if the fabric is completely done
-
-    insulated_wall_substrings = [
-        ", insulated", "with external insulation", "with internal insulation", "filled cavity"
-    ]
-
-    insulated_roof_substrings = [
-        "(another dwelling above)", "limited insulation", "(other premises above)",
-        ", no insulation",
-    ]
-
-    def check_solar_insulation_conditions(x):
-
-        if pd.isnull(x["Wall Construction"]):
-            return None
-
-        if "average thermal transmittance" in x["Wall Construction"].lower():
-            # We extract out the u-values
-            wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"]
-            roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"]
-            floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"]
-
-            roof_uvalue = 0 if roof_uvalue is None else roof_uvalue
-            floor_uvalue = 0 if floor_uvalue is None else floor_uvalue
-
-            # We apply some cutoffs
-            if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7:
-                return "Walls, Roof and Floor have U-values below 0.7"
-
-            return "Confirm U-values"
-
-        walls_insulated = any(
-            insulated_substring in x["Wall Construction"].lower() for insulated_substring in insulated_wall_substrings
-        )
-        roof_is_numeric = False
-        if str(x["Roof Insulation Thickness"]).isdigit():
-            roof_is_numeric = True
-            roof_insulated = int(x["Roof Insulation Thickness"]) >= 200
-        else:
-            roof_insulated = any(
-                insulated_substring in x["Roof Construction"].lower() for insulated_substring in
-                insulated_roof_substrings
-            )
-
-        floor_is_solid = "solid" in x["Floor Construction"].lower()
-
-        if walls_insulated and roof_insulated and floor_is_solid:
-            return "Walls Insulated, Roof Insulated, Floor Solid"
-
-        if walls_insulated and floor_is_solid and roof_is_numeric:
-            return "Walls Insulated, Floor Solid, Loft need top-up"
-
-        return "Not Fully Insulated or no data"
-
-    asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1)
-
-    asset_list["Good Solar Candidate"] = (
-        asset_list["SAP Rating is 75 and below"] &
-        ~asset_list["Has Solar PV"] &
-        (
-            asset_list["Heating Type"].isin(
-                [
-                    "Electric storage heaters",
-                    "Room heaters, electric",
-                ]
-            ) | asset_list["Heating Type"].str.contains("heat pump", case=False)
-        ) & (
-            asset_list["Solar Fabric Condition"].isin(
-                [
-                    "Walls Insulated, Roof Insulated, Floor Solid",
-                    "Walls, Roof and Floor have U-values below 0.7",
-                    "Walls Insulated, Floor Solid, Loft need top-up"
-                ]
-            )
-        )
-    )
-
+    # TODO: We should do this breakdown for flats
     def flat_analysis(asset_list):
 
         # We need to deduce the building name - we strip out the house number
@@ -596,19 +489,6 @@ def app():
 
     flat_data = flat_analysis(asset_list)
 
-    # For all of the columns in transformed_df, prefix with "Recommendation: "
-    for col in transformed_df.columns:
-        if col == "row_id":
-            continue
-        transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})
-
-    asset_list = asset_list.merge(
-        transformed_df,
-        how="left",
-        on="row_id"
-    )
-    asset_list = asset_list.drop(columns=["row_id", "index"])
-
     # Store as an excel
     filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
     # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data

From d86ab5ff8df50e58248bff92582084462fc2166b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 15:18:53 +0000
Subject: [PATCH 58/72] restructuring app location

---
 asset_list/app.py                          | 497 ++++++++++++++++++++
 etl/route_march_data_pull/app.py           | 502 ---------------------
 etl/route_march_data_pull/requirements.txt |   0
 3 files changed, 497 insertions(+), 502 deletions(-)
 delete mode 100644 etl/route_march_data_pull/app.py
 delete mode 100644 etl/route_march_data_pull/requirements.txt

diff --git a/asset_list/app.py b/asset_list/app.py
index 21b405d8..1a7788fe 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -1 +1,498 @@
 import os
+import time
+import json
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+import msgpack
+from utils.s3 import read_from_s3
+from asset_list.AssetList import AssetList
+from asset_list.mappings.property_type import PROPERTY_MAPPING
+from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
+from asset_list.mappings.heating_systems import HEATING_MAPPINGS
+from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
+
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(
+    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map,
+    uprn_column=None, epc_api_only=False, row_id_name="row_id"
+):
+    epc_data = []
+    errors = []
+    no_epc = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            postcode = home[postcode_column]
+            house_number = str(home[address1_column]).strip()
+            full_address = home[fulladdress_column].strip()
+            house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
+            if house_no is None:
+                house_no = house_number
+            uprn = manual_uprn_map.get(full_address, None)
+            if uprn is None and home.get(uprn_column):
+                uprn = home[uprn_column]
+
+            if pd.isnull(uprn):
+                uprn = None
+
+            searcher = SearchEpc(
+                address1=str(house_no),
+                postcode=postcode,
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=full_address,
+                max_retries=5,
+                uprn=uprn
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+
+            searcher.find_property(skip_os=True)
+
+            # Check if we have a flat or appartment
+            if searcher.newest_epc is None and uprn is None:
+                # Try again:
+                if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
+                    # Backup
+                    add1 = full_address.split(",")
+                    if len(add1) > 1:
+                        add1 = add1[1].strip()
+                    else:
+                        # Try splitting on space
+                        add1 = full_address.split(" ")[0].strip()
+
+                else:
+                    add1 = str(house_number)
+                searcher = SearchEpc(
+                    address1=add1,
+                    postcode=postcode,
+                    auth_token=EPC_AUTH_TOKEN,
+                    os_api_key="",
+                    property_type=None,
+                    fast=True,
+                    full_address=full_address,
+                    max_retries=5
+                )
+
+                if (
+                    "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in
+                    house_number.lower()
+                ):
+                    searcher.ordnance_survey_client.property_type = "Flat"
+
+                searcher.find_property(skip_os=True)
+
+            if searcher.newest_epc is None:
+                no_epc.append(home[row_id_name])
+                continue
+
+            if epc_api_only:
+                epc = {
+                    row_id_name: home[row_id_name],
+                    **searcher.newest_epc.copy()
+                }
+
+                epc_data.append(epc)
+                continue
+
+            # Look for EPC recommendatons
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except:
+                property_recommendations = {"rows": []}
+
+            # Retrieve data from FindMyEPC
+            try:
+                find_epc_searcher = RetrieveFindMyEpc(
+                    address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
+                )
+                find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+            except ValueError as e:
+                if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
+                    try:
+                        find_epc_searcher = RetrieveFindMyEpc(
+                            address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
+                        )
+                        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+                    except ValueError as e:
+                        if "No EPC found" in str(e):
+                            find_epc_data = {}
+                else:
+                    find_epc_data = {}
+            except Exception as e:
+                raise Exception(f"Error retrieving FindMyEPC data: {e}")
+            time.sleep(np.random.uniform(0.1, 1))
+
+            epc = {
+                row_id_name: home[row_id_name],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"],
+                "find_my_epc_data": find_epc_data,
+            }
+
+            epc_data.append(epc)
+        except Exception as e:
+            errors.append(home[row_id_name])
+            time.sleep(5)
+
+    return epc_data, errors, no_epc
+
+
+def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
+    if method == "first_two_words":
+        asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+        return asset_list
+
+    if method == "first_word":
+        asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
+        return asset_list
+
+    if method == "house_number_extraction":
+        asset_list["address1_extracted"] = asset_list.apply(
+            lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
+            axis=1
+        )
+        return asset_list
+
+    raise ValueError(f"Method {method} not recognized")
+
+
+def app():
+    """
+    This app is EPC pulling data for some properties owned by Livewest
+
+    Data request contents:
+    Date of last EPC
+    Reason for EPC
+    SAP score on register
+    Property Type
+    Property Area
+    Property Age
+    Any Dimensions (HLP,PW,RH)
+    Property Wall Construction
+    Heating Type
+    Secondary Heating
+    Loft Insulation Depth
+
+    Additional if possible:
+    Heat loss calculations
+    EPC recommendations
+    Property UPRN
+    """
+
+    # TODO:
+    # For cavity work:
+    # - Flag any entries that have a different wall type between non-intrusive data against EPC
+    # - Worth double checking entries that have a difference in wall construction
+    # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
+    # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
+    # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
+    # are less than C75
+    # - Flag anything pre SAP2012
+    # - Flag anything over 5 years old
+    # - Look at year built vs age band
+    #
+    # For Solar:
+    # - Discount any that have solar PV - based on non-intrusives and from the inspections team
+    # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
+    # electric room heaters but it might need to be an EPC E
+    # - Fabric - check the floor, wall and roof:
+    #     - Filled or empty cavity is good
+    #     - Insulated solid/timber/system built is good
+    #     - SCIS/CEG needs solid floors
+    #     - JJC don’t care
+    #     - Anything with a loft 200 or below
+    # - Anything C75 and above won’t qualify
+    # - Insulated loft = 200mm
+    # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
+    # - Or the insulation required is loft/cavity (floors should be solid)
+
+    # For Westward
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    DATA_FILENAME = "WESTWARD - completed list..xlsx"
+    SHEET_NAME = "Sheet1"
+
+    POSTCODE_COLUMN = "WFT EDIT Postcode"
+    FULLADDRESS_COLUMN = "Address"
+    ADDRESS1_COLUMN = None
+    ADDRESS1_METHOD = "house_number_extraction"
+
+    ADDRESS_COLS_TO_CONCAT = []
+    MISSING_POSTCODES_METHOD = None
+    PROPERTY_YEAR_BUILT = "Build date"
+    UPRN_COLUMN = "UPRN"
+    # If we have the non-intrusives data, this should be true
+    HAS_NON_INTRUSIVES = True
+    PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits
+
+    # Maps addresses to uprn in problematic cases
+    MANUAL_UPRN_MAP = {}
+
+    asset_list = AssetList(
+        local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
+        header=0,
+        sheet_name=SHEET_NAME,
+        address1_colname=ADDRESS1_COLUMN,
+        postcode_colname=POSTCODE_COLUMN,
+        landlord_property_id="UPRN",
+        full_address_colname=FULLADDRESS_COLUMN,
+        full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
+        missing_postcodes_method=MISSING_POSTCODES_METHOD,
+        address1_extraction_method=ADDRESS1_METHOD,
+        landlord_year_built=PROPERTY_YEAR_BUILT,
+        landlord_uprn=UPRN_COLUMN,
+        landlord_property_type=PROPERTY_TYPE_COLUMN,
+        landlord_wall_construction="Wall Construction (EPC)",
+        landlord_heating_system="Heat Source",
+        landlord_existing_pv="PV (Y/N)"
+    )
+    asset_list.init_standardise()
+
+    # We produce the new maps, which can be saved for future useage
+
+    new_property_type_map = PROPERTY_MAPPING.copy().update(
+        asset_list.variable_mappings[asset_list.landlord_property_type]
+    )
+    new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update(
+        asset_list.variable_mappings[asset_list.landlord_wall_construction]
+    )
+    new_heating_map = HEATING_MAPPINGS.copy().update(
+        asset_list.variable_mappings[asset_list.landlord_heating_system]
+    )
+    new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update(
+        asset_list.variable_mappings[asset_list.landlord_existing_pv]
+    )
+
+    asset_list.apply_standardiation()
+
+    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    # SHEET_NAME = "Sheet1"
+    # POSTCODE_COLUMN = 'Full Address.1'
+    # FULLADDRESS_COLUMN = "Full Address"
+    # ADDRESS1_COLUMN = None
+    # ADDRESS1_METHOD = "first_word"
+    # ADDRESS_COLS_TO_CONCAT = []
+    # MISSING_POSTCODES_METHOD = None
+    # PROPERTY_YEAR_BUILT = "Build Date"
+    # UPRN_COLUMN = None
+    # # If we have the non-intrusives data, this should be true
+    # HAS_NON_INTRUSIVES = True
+
+    ### We retrieve the EPC data
+
+    # We chunk up this data into 5000 rows at a time
+    # Create the chunks directory
+    force_retrieve_data = False
+    skip = None  # Used to skip already completed chunks
+    chunk_size = 5000
+    filename = "Chunk {i}.csv"
+    download_folder = os.path.join(DATA_FOLDER, "Chunks")
+    if not os.path.exists(download_folder):
+        os.makedirs(download_folder)
+
+    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
+    downloaded_files = {filename.format(i=i) for i in chunk_indexes}
+
+    # We check if we have files associated to these files already and if we do, and we do not want to force the
+    # fetching of the data, we skip
+    folder_contents = os.listdir(download_folder)
+    if all(x in folder_contents for x in downloaded_files):
+        skip = max(chunk_indexes)
+
+    for i in range(0, len(asset_list.standardised_asset_list), chunk_size):
+        print(f"Processing chunk {i} to {i + chunk_size}")
+        if skip is not None and not force_retrieve_data:
+            if i <= skip:
+                continue
+        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
+        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
+            asset_list=chunk,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            fulladdress_column=asset_list.STANDARD_FULL_ADDRESS,
+            address1_column=asset_list.STANDARD_ADDRESS_1,
+            postcode_column=asset_list.STANDARD_POSTCODE,
+            manual_uprn_map=MANUAL_UPRN_MAP,
+            uprn_column=asset_list.STANDARD_UPRN
+        )
+
+        # We now retrieve any failed properties
+        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
+        epc_data_failed, _, _ = get_data(
+            asset_list=chunk_failed,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            fulladdress_column=FULLADDRESS_COLUMN,
+            address1_column=ADDRESS1_COLUMN,
+            postcode_column=POSTCODE_COLUMN,
+            manual_uprn_map=MANUAL_UPRN_MAP,
+            epc_api_only=False
+        )
+
+        epc_data_chunk.extend(epc_data_failed)
+
+        # Append the failed data to the main data
+        # Store the chunk locally as a csv
+        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
+        # Store the errors and no-data locally
+        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f:
+            json.dump(errors_chunk, f)
+
+        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
+            json.dump(no_epc_chunk, f)
+
+    # We read in and concatenate the created created chunks
+    # List the contents
+    epc_data = []
+    for file in downloaded_files:
+        csv_data = pd.read_csv(os.path.join(download_folder, file))
+        # We need to convert the recommendations back to a list
+        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
+        csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
+        epc_data.append(csv_data)
+
+    epc_df = pd.concat(epc_data)
+    # TODO: TEMP!!!
+    epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID})
+
+    # We expand out the recommendations
+    recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
+
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    transformed_df = transformed_df[
+        [
+            asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)",
+            "Floor insulation", "Floor insulation (suspended floor)"
+        ]
+    ]
+
+    transformed_df["epc_has_floor_recommendation"] = (
+        transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] |
+        transformed_df["Floor insulation (suspended floor)"]
+    )
+
+    # Get the find my epc data
+    find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop(
+        columns=["find_my_epc_data"]).join(
+        pd.json_normalize(epc_df["find_my_epc_data"])
+    )
+    find_my_epc_data = find_my_epc_data.merge(
+        transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]],
+        how="left", on=asset_list.DOMNA_PROPERTY_ID
+    )
+
+    # We check if we get the solar pv column:
+    if "Solar photovoltaics" not in find_my_epc_data.columns:
+        find_my_epc_data["Solar photovoltaics"] = False
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
+        ].rename(
+        columns=asset_list.EPC_API_DATA_NAMES
+    )
+
+    epc_df = epc_df.merge(
+        find_my_epc_data[
+            [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys())
+            ]
+        .rename(columns=asset_list.FIND_EPC_DATA_NAMES),
+        how="left",
+        on=asset_list.DOMNA_PROPERTY_ID
+    )
+
+    asset_list.merge_data(epc_df)
+    # TODO: TEMP!!!
+    epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str)
+    asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge(
+        epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn"
+    )
+
+    asset_list.extract_attributes()
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    asset_list.identify_worktypes(cleaned)
+
+    # TODO: We should do this breakdown for flats
+    def flat_analysis(asset_list):
+
+        # We need to deduce the building name - we strip out the house number
+        def extract_building_name(x):
+            # TODO: This doesn't really work
+            if pd.isnull(x):
+                return None
+            house_no = SearchEpc.get_house_number(address=x, postcode=None)
+            if house_no:
+                return x.replace(house_no, "").strip()
+            return x.split(",")[0].strip()
+
+        # We want to deduce if flats have 50% of the properties below C75
+        # We group by postcode and property type
+        grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"])
+
+        flat_data = []
+        for _, group in grouped:
+            if "flat" in group["Property Type"].str.lower().values:
+                num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0)
+                num_below_c75 = group["SAP score on register"].lt(75).sum()
+
+                flat_data.append(
+                    {
+                        "Postcode": group[POSTCODE_COLUMN].iloc[0],
+                        "Property Type": "Flat",
+                        "Number of Flats with EPC": num_flats,
+                        "Number of Flats below C75": num_below_c75,
+                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats)
+                    }
+                )
+
+        flat_data = pd.DataFrame(flat_data)
+
+        return flat_data
+
+    flat_data = flat_analysis(asset_list)
+
+    # Store as an excel
+    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
+    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
+
+    with pd.ExcelWriter(filename) as writer:
+        asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
+        flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
+
+    matches_review = asset_list[
+        [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
+    ]
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
deleted file mode 100644
index 7bf3cca8..00000000
--- a/etl/route_march_data_pull/app.py
+++ /dev/null
@@ -1,502 +0,0 @@
-import os
-import time
-import json
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-import msgpack
-from utils.s3 import read_from_s3
-from asset_list.AssetList import AssetList
-from asset_list.mappings.property_type import PROPERTY_MAPPING
-from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
-from asset_list.mappings.heating_systems import HEATING_MAPPINGS
-from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
-
-from dotenv import load_dotenv
-from backend.SearchEpc import SearchEpc
-from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
-
-from etl.epc_clean.epc_attributes.attribute_utils import (
-    extract_thermal_transmittance
-)
-
-load_dotenv(dotenv_path="backend/.env")
-EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
-
-
-def get_data(
-    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map,
-    uprn_column=None, epc_api_only=False, row_id_name="row_id"
-):
-    epc_data = []
-    errors = []
-    no_epc = []
-    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
-        try:
-            postcode = home[postcode_column]
-            house_number = str(home[address1_column]).strip()
-            full_address = home[fulladdress_column].strip()
-            house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
-            if house_no is None:
-                house_no = house_number
-            uprn = manual_uprn_map.get(full_address, None)
-            if uprn is None and home.get(uprn_column):
-                uprn = home[uprn_column]
-
-            if pd.isnull(uprn):
-                uprn = None
-
-            searcher = SearchEpc(
-                address1=str(house_no),
-                postcode=postcode,
-                auth_token=EPC_AUTH_TOKEN,
-                os_api_key="",
-                property_type=None,
-                fast=True,
-                full_address=full_address,
-                max_retries=5,
-                uprn=uprn
-            )
-            # Force the skipping of estimating the EPC
-            searcher.ordnance_survey_client.property_type = None
-            searcher.ordnance_survey_client.built_form = None
-
-            searcher.find_property(skip_os=True)
-
-            # Check if we have a flat or appartment
-            if searcher.newest_epc is None and uprn is None:
-                # Try again:
-                if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
-                    # Backup
-                    add1 = full_address.split(",")
-                    if len(add1) > 1:
-                        add1 = add1[1].strip()
-                    else:
-                        # Try splitting on space
-                        add1 = full_address.split(" ")[0].strip()
-
-                else:
-                    add1 = str(house_number)
-                searcher = SearchEpc(
-                    address1=add1,
-                    postcode=postcode,
-                    auth_token=EPC_AUTH_TOKEN,
-                    os_api_key="",
-                    property_type=None,
-                    fast=True,
-                    full_address=full_address,
-                    max_retries=5
-                )
-
-                if (
-                    "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in
-                    house_number.lower()
-                ):
-                    searcher.ordnance_survey_client.property_type = "Flat"
-
-                searcher.find_property(skip_os=True)
-
-            if searcher.newest_epc is None:
-                no_epc.append(home[row_id_name])
-                continue
-
-            if epc_api_only:
-                epc = {
-                    row_id_name: home[row_id_name],
-                    **searcher.newest_epc.copy()
-                }
-
-                epc_data.append(epc)
-                continue
-
-            # Look for EPC recommendatons
-            try:
-                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
-            except:
-                property_recommendations = {"rows": []}
-
-            # Retrieve data from FindMyEPC
-            try:
-                find_epc_searcher = RetrieveFindMyEpc(
-                    address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
-                )
-                find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
-            except ValueError as e:
-                if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
-                    try:
-                        find_epc_searcher = RetrieveFindMyEpc(
-                            address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
-                        )
-                        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
-                    except ValueError as e:
-                        if "No EPC found" in str(e):
-                            find_epc_data = {}
-                else:
-                    find_epc_data = {}
-            except Exception as e:
-                raise Exception(f"Error retrieving FindMyEPC data: {e}")
-            time.sleep(np.random.uniform(0.1, 1))
-
-            epc = {
-                row_id_name: home[row_id_name],
-                **searcher.newest_epc.copy(),
-                "recommendations": property_recommendations["rows"],
-                "find_my_epc_data": find_epc_data,
-            }
-
-            epc_data.append(epc)
-        except Exception as e:
-            errors.append(home[row_id_name])
-            time.sleep(5)
-
-    return epc_data, errors, no_epc
-
-
-def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
-    if method == "first_two_words":
-        asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
-        return asset_list
-
-    if method == "first_word":
-        asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
-        return asset_list
-
-    if method == "house_number_extraction":
-        asset_list["address1_extracted"] = asset_list.apply(
-            lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
-            axis=1
-        )
-        return asset_list
-
-    raise ValueError(f"Method {method} not recognized")
-
-
-def app():
-    """
-    This app is EPC pulling data for some properties owned by Livewest
-
-    Data request contents:
-    Date of last EPC
-    Reason for EPC
-    SAP score on register
-    Property Type
-    Property Area
-    Property Age
-    Any Dimensions (HLP,PW,RH)
-    Property Wall Construction
-    Heating Type
-    Secondary Heating
-    Loft Insulation Depth
-
-    Additional if possible:
-    Heat loss calculations
-    EPC recommendations
-    Property UPRN
-    """
-
-    # TODO:
-    # For cavity work:
-    # - Flag any entries that have a different wall type between non-intrusive data against EPC
-    # - Worth double checking entries that have a difference in wall construction
-    # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
-    # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
-    # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
-    # are less than C75
-    # - Flag anything pre SAP2012
-    # - Flag anything over 5 years old
-    # - Look at year built vs age band
-    #
-    # For Solar:
-    # - Discount any that have solar PV - based on non-intrusives and from the inspections team
-    # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
-    # electric room heaters but it might need to be an EPC E
-    # - Fabric - check the floor, wall and roof:
-    #     - Filled or empty cavity is good
-    #     - Insulated solid/timber/system built is good
-    #     - SCIS/CEG needs solid floors
-    #     - JJC don’t care
-    #     - Anything with a loft 200 or below
-    # - Anything C75 and above won’t qualify
-    # - Insulated loft = 200mm
-    # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
-    # - Or the insulation required is loft/cavity (floors should be solid)
-
-    # For Westward
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
-    DATA_FILENAME = "WESTWARD - completed list..xlsx"
-    SHEET_NAME = "Sheet1"
-
-    POSTCODE_COLUMN = "WFT EDIT Postcode"
-    FULLADDRESS_COLUMN = "Address"
-    ADDRESS1_COLUMN = None
-    ADDRESS1_METHOD = "house_number_extraction"
-
-    ADDRESS_COLS_TO_CONCAT = []
-    MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = "Build date"
-    UPRN_COLUMN = "UPRN"
-    # If we have the non-intrusives data, this should be true
-    HAS_NON_INTRUSIVES = True
-    PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits
-
-    # Maps addresses to uprn in problematic cases
-    MANUAL_UPRN_MAP = {}
-
-    asset_list = AssetList(
-        local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
-        header=0,
-        sheet_name=SHEET_NAME,
-        address1_colname=ADDRESS1_COLUMN,
-        postcode_colname=POSTCODE_COLUMN,
-        landlord_property_id="UPRN",
-        full_address_colname=FULLADDRESS_COLUMN,
-        full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
-        missing_postcodes_method=MISSING_POSTCODES_METHOD,
-        address1_extraction_method=ADDRESS1_METHOD,
-        landlord_year_built=PROPERTY_YEAR_BUILT,
-        landlord_uprn=UPRN_COLUMN,
-        landlord_property_type=PROPERTY_TYPE_COLUMN,
-        landlord_wall_construction="Wall Construction (EPC)",
-        landlord_heating_system="Heat Source",
-        landlord_existing_pv="PV (Y/N)"
-    )
-    asset_list.init_standardise()
-
-    # We produce the new maps, which can be saved for future useage
-
-    new_property_type_map = PROPERTY_MAPPING.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_property_type]
-    )
-    new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_wall_construction]
-    )
-    new_heating_map = HEATING_MAPPINGS.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_heating_system]
-    )
-    new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_existing_pv]
-    )
-
-    asset_list.apply_standardiation()
-
-    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
-    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
-    # SHEET_NAME = "Sheet1"
-    # POSTCODE_COLUMN = 'Full Address.1'
-    # FULLADDRESS_COLUMN = "Full Address"
-    # ADDRESS1_COLUMN = None
-    # ADDRESS1_METHOD = "first_word"
-    # ADDRESS_COLS_TO_CONCAT = []
-    # MISSING_POSTCODES_METHOD = None
-    # PROPERTY_YEAR_BUILT = "Build Date"
-    # UPRN_COLUMN = None
-    # # If we have the non-intrusives data, this should be true
-    # HAS_NON_INTRUSIVES = True
-
-    ### We retrieve the EPC data
-
-    # We chunk up this data into 5000 rows at a time
-    # Create the chunks directory
-    force_retrieve_data = False
-    skip = None  # Used to skip already completed chunks
-    chunk_size = 5000
-    filename = "Chunk {i}.csv"
-    download_folder = os.path.join(DATA_FOLDER, "Chunks")
-    if not os.path.exists(download_folder):
-        os.makedirs(download_folder)
-
-    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
-    downloaded_files = {filename.format(i=i) for i in chunk_indexes}
-
-    # We check if we have files associated to these files already and if we do, and we do not want to force the
-    # fetching of the data, we skip
-    folder_contents = os.listdir(download_folder)
-    if all(x in folder_contents for x in downloaded_files):
-        skip = max(chunk_indexes)
-
-    for i in range(0, len(asset_list.standardised_asset_list), chunk_size):
-        print(f"Processing chunk {i} to {i + chunk_size}")
-        if skip is not None and not force_retrieve_data:
-            if i <= skip:
-                continue
-        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
-        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
-            asset_list=chunk,
-            row_id_name=asset_list.DOMNA_PROPERTY_ID,
-            fulladdress_column=asset_list.STANDARD_FULL_ADDRESS,
-            address1_column=asset_list.STANDARD_ADDRESS_1,
-            postcode_column=asset_list.STANDARD_POSTCODE,
-            manual_uprn_map=MANUAL_UPRN_MAP,
-            uprn_column=asset_list.STANDARD_UPRN
-        )
-
-        # We now retrieve any failed properties
-        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
-        epc_data_failed, _, _ = get_data(
-            asset_list=chunk_failed,
-            row_id_name=asset_list.DOMNA_PROPERTY_ID,
-            fulladdress_column=FULLADDRESS_COLUMN,
-            address1_column=ADDRESS1_COLUMN,
-            postcode_column=POSTCODE_COLUMN,
-            manual_uprn_map=MANUAL_UPRN_MAP,
-            epc_api_only=False
-        )
-
-        epc_data_chunk.extend(epc_data_failed)
-
-        # Append the failed data to the main data
-        # Store the chunk locally as a csv
-        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
-        # Store the errors and no-data locally
-        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f:
-            json.dump(errors_chunk, f)
-
-        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
-            json.dump(no_epc_chunk, f)
-
-    # We read in and concatenate the created created chunks
-    # List the contents
-    epc_data = []
-    for file in downloaded_files:
-        csv_data = pd.read_csv(os.path.join(download_folder, file))
-        # We need to convert the recommendations back to a list
-        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
-        csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
-        epc_data.append(csv_data)
-
-    epc_df = pd.concat(epc_data)
-    # TODO: TEMP!!!
-    epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID})
-
-    # We expand out the recommendations
-    recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
-
-    unique_recommendations = set()
-    for _, row in recommendations_df.iterrows():
-        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
-
-    columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations)
-    transformed_data = []
-    for _, row in recommendations_df.iterrows():
-        # Initialize a dictionary for this row with False for all recommendations
-        row_data = {col: False for col in columns}
-        row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID]
-
-        # Set True for each recommendation present in this row
-        for rec in row["recommendations"]:
-            recommendation_text = rec["improvement-summary-text"]
-            row_data[recommendation_text] = True
-
-        # Append the row data to transformed_data
-        transformed_data.append(row_data)
-
-    transformed_df = pd.DataFrame(transformed_data)
-    transformed_df = transformed_df[
-        [
-            asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)",
-            "Floor insulation", "Floor insulation (suspended floor)"
-        ]
-    ]
-
-    transformed_df["epc_has_floor_recommendation"] = (
-        transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] |
-        transformed_df["Floor insulation (suspended floor)"]
-    )
-
-    # Get the find my epc data
-    find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop(
-        columns=["find_my_epc_data"]).join(
-        pd.json_normalize(epc_df["find_my_epc_data"])
-    )
-    find_my_epc_data = find_my_epc_data.merge(
-        transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]],
-        how="left", on=asset_list.DOMNA_PROPERTY_ID
-    )
-
-    # We check if we get the solar pv column:
-    if "Solar photovoltaics" not in find_my_epc_data.columns:
-        find_my_epc_data["Solar photovoltaics"] = False
-
-    # Retrieve just the data we need
-    epc_df = epc_df[
-        [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
-        ].rename(
-        columns=asset_list.EPC_API_DATA_NAMES
-    )
-
-    epc_df = epc_df.merge(
-        find_my_epc_data[
-            [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys())
-            ]
-        .rename(columns=asset_list.FIND_EPC_DATA_NAMES),
-        how="left",
-        on=asset_list.DOMNA_PROPERTY_ID
-    )
-
-    asset_list.merge_data(epc_df)
-    # TODO: TEMP!!!
-    epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str)
-    asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge(
-        epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn"
-    )
-
-    asset_list.extract_attributes()
-
-    cleaned = read_from_s3(
-        s3_file_name="cleaned_epc_data/cleaned.bson",
-        bucket_name="retrofit-data-dev"
-    )
-    cleaned = msgpack.unpackb(cleaned, raw=False)
-
-    asset_list.identify_worktypes(cleaned)
-
-    # TODO: We should do this breakdown for flats
-    def flat_analysis(asset_list):
-
-        # We need to deduce the building name - we strip out the house number
-        def extract_building_name(x):
-            # TODO: This doesn't really work
-            if pd.isnull(x):
-                return None
-            house_no = SearchEpc.get_house_number(address=x, postcode=None)
-            if house_no:
-                return x.replace(house_no, "").strip()
-            return x.split(",")[0].strip()
-
-        # We want to deduce if flats have 50% of the properties below C75
-        # We group by postcode and property type
-        grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"])
-
-        flat_data = []
-        for _, group in grouped:
-            if "flat" in group["Property Type"].str.lower().values:
-                num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0)
-                num_below_c75 = group["SAP score on register"].lt(75).sum()
-
-                flat_data.append(
-                    {
-                        "Postcode": group[POSTCODE_COLUMN].iloc[0],
-                        "Property Type": "Flat",
-                        "Number of Flats with EPC": num_flats,
-                        "Number of Flats below C75": num_below_c75,
-                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats)
-                    }
-                )
-
-        flat_data = pd.DataFrame(flat_data)
-
-        return flat_data
-
-    flat_data = flat_analysis(asset_list)
-
-    # Store as an excel
-    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
-    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
-
-    with pd.ExcelWriter(filename) as writer:
-        asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
-        flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
-
-    matches_review = asset_list[
-        [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
-    ]
diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt
deleted file mode 100644
index e69de29b..00000000

From 759e81f6606ee9355612ed9526acd8c77dc12096 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 15:25:38 +0000
Subject: [PATCH 59/72] refactoring

---
 asset_list/app.py           | 20 +++++++++++++++-----
 asset_list/requirements.txt |  3 ++-
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/asset_list/app.py b/asset_list/app.py
index 1a7788fe..df2fe9cc 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -21,13 +21,21 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
 def get_data(
-    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map,
+    df, fulladdress_column, address1_column, postcode_column, manual_uprn_map,
     uprn_column=None, epc_api_only=False, row_id_name="row_id"
 ):
+    # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs
+    property_type_map = {
+        "house": "House",
+        "flat": "Flat",
+        "maisonette": "Maisonette",
+        "bungalow": "Bungalow",
+    }
+
     epc_data = []
     errors = []
     no_epc = []
-    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+    for _, home in tqdm(df.iterrows(), total=len(df)):
         try:
             postcode = home[postcode_column]
             house_number = str(home[address1_column]).strip()
@@ -42,19 +50,21 @@ def get_data(
             if pd.isnull(uprn):
                 uprn = None
 
+            property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None)
+
             searcher = SearchEpc(
                 address1=str(house_no),
                 postcode=postcode,
                 auth_token=EPC_AUTH_TOKEN,
                 os_api_key="",
-                property_type=None,
+                property_type=property_type,
                 fast=True,
                 full_address=full_address,
                 max_retries=5,
                 uprn=uprn
             )
             # Force the skipping of estimating the EPC
-            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.property_type = property_type
             searcher.ordnance_survey_client.built_form = None
 
             searcher.find_property(skip_os=True)
@@ -317,7 +327,7 @@ def app():
                 continue
         chunk = asset_list.standardised_asset_list[i:i + chunk_size]
         epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
-            asset_list=chunk,
+            df=chunk,
             row_id_name=asset_list.DOMNA_PROPERTY_ID,
             fulladdress_column=asset_list.STANDARD_FULL_ADDRESS,
             address1_column=asset_list.STANDARD_ADDRESS_1,
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
index fd045d46..fd43ac64 100644
--- a/asset_list/requirements.txt
+++ b/asset_list/requirements.txt
@@ -8,4 +8,5 @@ boto3
 openpyxl
 openai
 tiktoken
-msgpack
\ No newline at end of file
+msgpack
+beautifulsoup4
\ No newline at end of file

From 33558957df5b718fd81f9a89064f24ceffa2b139 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 17:00:12 +0000
Subject: [PATCH 60/72] adding methodology to estimate the EPC if we don't have
 it

---
 asset_list/app.py    | 22 +++++++++++++---------
 backend/SearchEpc.py |  2 +-
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/asset_list/app.py b/asset_list/app.py
index df2fe9cc..5bbf25d4 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -21,9 +21,13 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
 def get_data(
-    df, fulladdress_column, address1_column, postcode_column, manual_uprn_map,
-    uprn_column=None, epc_api_only=False, row_id_name="row_id"
+    df, manual_uprn_map, epc_api_only=False, row_id_name="row_id"
 ):
+    uprn_column = AssetList.STANDARD_UPRN
+    fulladdress_column = AssetList.STANDARD_FULL_ADDRESS
+    address1_column = AssetList.STANDARD_ADDRESS_1
+    postcode_column = AssetList.STANDARD_POSTCODE
+
     # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs
     property_type_map = {
         "house": "House",
@@ -57,14 +61,14 @@ def get_data(
                 postcode=postcode,
                 auth_token=EPC_AUTH_TOKEN,
                 os_api_key="",
-                property_type=property_type,
+                property_type=None,
                 fast=True,
                 full_address=full_address,
                 max_retries=5,
                 uprn=uprn
             )
             # Force the skipping of estimating the EPC
-            searcher.ordnance_survey_client.property_type = property_type
+            searcher.ordnance_survey_client.property_type = None
             searcher.ordnance_survey_client.built_form = None
 
             searcher.find_property(skip_os=True)
@@ -102,6 +106,11 @@ def get_data(
 
                 searcher.find_property(skip_os=True)
 
+            # As a final resort, we estimate the EPC
+            if property_type is not None:
+                searcher.ordnance_survey_client.property_type = property_type
+                searcher.find_property(skip_os=True)
+
             if searcher.newest_epc is None:
                 no_epc.append(home[row_id_name])
                 continue
@@ -328,12 +337,7 @@ def app():
         chunk = asset_list.standardised_asset_list[i:i + chunk_size]
         epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
             df=chunk,
-            row_id_name=asset_list.DOMNA_PROPERTY_ID,
-            fulladdress_column=asset_list.STANDARD_FULL_ADDRESS,
-            address1_column=asset_list.STANDARD_ADDRESS_1,
-            postcode_column=asset_list.STANDARD_POSTCODE,
             manual_uprn_map=MANUAL_UPRN_MAP,
-            uprn_column=asset_list.STANDARD_UPRN
         )
 
         # We now retrieve any failed properties
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 79a041ec..0d921bec 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -337,7 +337,7 @@ class SearchEpc:
             if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
         ]
 
-        if data:
+        if data["rows"]:
             api_response["msg"] = self.SUCCESS
 
         return api_response["msg"]

From d69baa21dab3c066b20b3823f9bac52da4eba7da Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 17:22:00 +0000
Subject: [PATCH 61/72] estimating epcs

---
 asset_list/app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/asset_list/app.py b/asset_list/app.py
index 5bbf25d4..229bf171 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -337,6 +337,7 @@ def app():
         chunk = asset_list.standardised_asset_list[i:i + chunk_size]
         epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
             df=chunk,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
             manual_uprn_map=MANUAL_UPRN_MAP,
         )
 

From d1dc536ab0c4424ac6fda9c39422659a547e8fbe Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 17:33:18 +0000
Subject: [PATCH 62/72] merging on epc data

---
 asset_list/AssetList.py | 2 +-
 asset_list/app.py       | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index ffe53d40..2d224daa 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -751,7 +751,7 @@ class AssetList:
         # We produce some additional fields
         # 1) Is the SAP rating below C75
         self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
-            self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <=
+            self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].astype(float) <=
             self.FILLED_CAVITY_SAP_THRESHOLD
         )
         # 2) Flag anything where the EPC is older than 5 years
diff --git a/asset_list/app.py b/asset_list/app.py
index 229bf171..34cc9579 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -376,8 +376,6 @@ def app():
         epc_data.append(csv_data)
 
     epc_df = pd.concat(epc_data)
-    # TODO: TEMP!!!
-    epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID})
 
     # We expand out the recommendations
     recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
@@ -445,11 +443,6 @@ def app():
     )
 
     asset_list.merge_data(epc_df)
-    # TODO: TEMP!!!
-    epc_df["epc_os_uprn"] = epc_df["epc_os_uprn"].astype("Int64").astype(str)
-    asset_list.standardised_asset_list = asset_list.standardised_asset_list.merge(
-        epc_df.drop(columns=["domna_property_id"]), how="left", left_on="ordnance_survey_uprn", right_on="epc_os_uprn"
-    )
 
     asset_list.extract_attributes()
 

From ea1a7b559d7fd3fa1c3f4b54365fe2eeebf0a3b3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 21 Feb 2025 22:57:56 +0000
Subject: [PATCH 63/72] fixed bug with calling find epc

---
 asset_list/app.py                    | 10 +++++-----
 etl/find_my_epc/RetrieveFindMyEpc.py |  3 ++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/asset_list/app.py b/asset_list/app.py
index 34cc9579..3c1ab627 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -34,6 +34,9 @@ def get_data(
         "flat": "Flat",
         "maisonette": "Maisonette",
         "bungalow": "Bungalow",
+        "block house": "House",
+        "coach house": "House",
+        "bedsit": "Flat"
     }
 
     epc_data = []
@@ -107,7 +110,7 @@ def get_data(
                 searcher.find_property(skip_os=True)
 
             # As a final resort, we estimate the EPC
-            if property_type is not None:
+            if property_type is not None and searcher.newest_epc is None:
                 searcher.ordnance_survey_client.property_type = property_type
                 searcher.find_property(skip_os=True)
 
@@ -344,11 +347,8 @@ def app():
         # We now retrieve any failed properties
         chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
         epc_data_failed, _, _ = get_data(
-            asset_list=chunk_failed,
+            df=chunk_failed,
             row_id_name=asset_list.DOMNA_PROPERTY_ID,
-            fulladdress_column=FULLADDRESS_COLUMN,
-            address1_column=ADDRESS1_COLUMN,
-            postcode_column=POSTCODE_COLUMN,
             manual_uprn_map=MANUAL_UPRN_MAP,
             epc_api_only=False
         )
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index eaba1058..9852cc0d 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -330,7 +330,8 @@ class RetrieveFindMyEpc:
                 "roomstat_programmer_trvs", "time_temperature_zone_control"
             ],
             "Replacement warm air unit": [],
-            "Secondary glazing": ["secondary_glazing"]
+            "Secondary glazing": ["secondary_glazing"],
+            "Condensing heating unit": ["boiler_upgrade"],
         }
 
         survey = True

From 7b4218299ff1c3b108d3259cecb7fee13f4d1096 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 24 Feb 2025 12:11:47 +0000
Subject: [PATCH 64/72] adding work reasons

---
 asset_list/AssetList.py | 78 ++++++++++++++++++++++++++++++++++++-----
 asset_list/app.py       | 37 ++++++++++---------
 2 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 2d224daa..54f6cd96 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -296,7 +296,7 @@ class AssetList:
     ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
     ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
     ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below"
-    ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}"
+    ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"epc_is_pre_{EPC_YEAR_THRESHOLD}"
 
     # These are the descriptions that we look for in the EPC data that are indicative of no insulation
     EPC_NO_WALL_INSULATION_DESCRIPTIONS = [
@@ -775,7 +775,7 @@ class AssetList:
                         self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
                         "epc_year_lower_bound": None,
                         "epc_year_upper_bound": None,
-                        "Does Age Match EPC Age Band?": "No EPC Age Band"
+                        "does_age_band_match_epc_age_band": "No EPC Age Band"
                     }
                 )
                 continue
@@ -800,7 +800,7 @@ class AssetList:
                         self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
                         "epc_year_lower_bound": year_lower_bound,
                         "epc_year_upper_bound": None,
-                        "Does Age Match EPC Age Band?": age_band_matches
+                        "does_age_band_match_epc_age_band": age_band_matches
                     }
                 )
                 continue
@@ -820,7 +820,7 @@ class AssetList:
                         self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
                         "epc_year_lower_bound": None,
                         "epc_year_upper_bound": 1899,
-                        "Does Age Match EPC Age Band?": age_band_matches
+                        "does_age_band_match_epc_age_band": age_band_matches
                     }
                 )
                 continue
@@ -842,7 +842,7 @@ class AssetList:
                         self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
                         "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]),
                         "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]),
-                        "Does Age Match EPC Age Band?": age_band_matches
+                        "does_age_band_match_epc_age_band": age_band_matches
                     }
                 )
                 continue
@@ -864,7 +864,7 @@ class AssetList:
                     self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
                     "epc_year_lower_bound": int(lower_date),
                     "epc_year_upper_bound": int(upper_date),
-                    "Does Age Match EPC Age Band?": age_band_matches
+                    "does_age_band_match_epc_age_band": age_band_matches
                 }
             )
 
@@ -892,7 +892,12 @@ class AssetList:
                 (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
                 (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
                 self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) &
-                (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000)
+                (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) &
+                (
+                    self.standardised_asset_list[
+                        self.EPC_API_DATA_NAMES["current-energy-efficiency"]
+                    ] <= self.EMPTY_CAVITY_SAP_THRESHOLD
+                )
             )
 
             self.standardised_asset_list["epc_indicates_empty_cavity"] = (
@@ -1206,6 +1211,11 @@ class AssetList:
                 self.standardised_asset_list["solar_epc_floor_is_other_insulated"]
             )
 
+            # Drop anything we don't need
+            self.standardised_asset_list = self.standardised_asset_list.drop(
+                columns=["walls_u_value", "roof_u_value", "floor_u_value"]
+            )
+
         # Produce some aggregate figures
         self.work_type_figures = {
             # Empty cavity from non-intrusives
@@ -1219,7 +1229,11 @@ class AssetList:
                 ).sum()
             ),
             "Cavity Extraction": (
-                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"].sum()
+                (
+                    ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
+                    ~self.standardised_asset_list["epc_indicates_empty_cavity"] &
+                    self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"]
+                ).sum()
             ),
             "Solar PV (Solid Floor)": (
                 self.standardised_asset_list["solar_eligible_solid_floor"].sum()
@@ -1234,3 +1248,51 @@ class AssetList:
                 self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum()
             )
         }
+
+        # Finally, we note why each property has been flagged
+        self.standardised_asset_list["cavity_reason"] = None
+        self.standardised_asset_list["cavity_reason"] = np.where(
+            self.standardised_asset_list["non_intrusive_indicates_empty_cavity"],
+            "Non-Intrusive Data Showed Empty Cavity",
+            self.standardised_asset_list["cavity_reason"]
+        )
+        self.standardised_asset_list["cavity_reason"] = np.where(
+            (
+                self.standardised_asset_list["epc_indicates_empty_cavity"] &
+                ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]
+            ),
+            "EPC Data Showed Empty Cavity",
+            self.standardised_asset_list["cavity_reason"]
+        )
+        # Flag extraction
+        self.standardised_asset_list["cavity_reason"] = np.where(
+            (
+                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] &
+                pd.isnull(self.standardised_asset_list["cavity_reason"])
+            ),
+            "Non-Intrusive Data Showed Cavity Extraction",
+            self.standardised_asset_list["cavity_reason"]
+        )
+
+        # Flag solar
+        self.standardised_asset_list["solar_reason"] = None
+        self.standardised_asset_list["solar_reason"] = np.where(
+            self.standardised_asset_list["solar_eligible_solid_floor"],
+            "Solid Floor, Insulated, No Solar",
+            self.standardised_asset_list["solar_reason"]
+        )
+        self.standardised_asset_list["solar_reason"] = np.where(
+            self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"],
+            "Solid Floor, Insulated, Needs Loft",
+            self.standardised_asset_list["solar_reason"]
+        )
+        self.standardised_asset_list["solar_reason"] = np.where(
+            self.standardised_asset_list["solar_eligible_other_floor"],
+            "Other Floor, Insulated, No Solar",
+            self.standardised_asset_list["solar_reason"]
+        )
+        self.standardised_asset_list["solar_reason"] = np.where(
+            self.standardised_asset_list["solar_eligible_other_floor_needs_loft"],
+            "Other Floor, Insulated, Needs Loft",
+            self.standardised_asset_list["solar_reason"]
+        )
diff --git a/asset_list/app.py b/asset_list/app.py
index 3c1ab627..65d4ab87 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -376,6 +376,7 @@ def app():
         epc_data.append(csv_data)
 
     epc_df = pd.concat(epc_data)
+    epc_df["estimated"] = epc_df["estimated"].fillna(False)
 
     # We expand out the recommendations
     recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
@@ -454,36 +455,40 @@ def app():
 
     asset_list.identify_worktypes(cleaned)
 
+    from pprint import pprint
+    pprint(asset_list.work_type_figures)
+
     # TODO: We should do this breakdown for flats
     def flat_analysis(asset_list):
 
         # We need to deduce the building name - we strip out the house number
-        def extract_building_name(x):
-            # TODO: This doesn't really work
-            if pd.isnull(x):
-                return None
-            house_no = SearchEpc.get_house_number(address=x, postcode=None)
-            if house_no:
-                return x.replace(house_no, "").strip()
-            return x.split(",")[0].strip()
 
         # We want to deduce if flats have 50% of the properties below C75
         # We group by postcode and property type
-        grouped = asset_list.groupby([POSTCODE_COLUMN, "Property Type"])
+        grouped = asset_list.standardised_asset_list.groupby(
+            [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE]
+        )
 
         flat_data = []
         for _, group in grouped:
-            if "flat" in group["Property Type"].str.lower().values:
-                num_flats = group["Property Type"].str.lower().value_counts().get("flat", 0)
-                num_below_c75 = group["SAP score on register"].lt(75).sum()
+            if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values:
+                num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0]
+                num_below_c75 = group[
+                    asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"]
+                ].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum()
+                # Check if any flats are below C69
+                num_flats_below_c69 = group[
+                    asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"]
+                ].lt(69).sum()
 
                 flat_data.append(
                     {
-                        "Postcode": group[POSTCODE_COLUMN].iloc[0],
+                        "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0],
                         "Property Type": "Flat",
                         "Number of Flats with EPC": num_flats,
                         "Number of Flats below C75": num_below_c75,
-                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats)
+                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats),
+                        "num_flats_below_c69": num_flats_below_c69,
                     }
                 )
 
@@ -494,11 +499,11 @@ def app():
     flat_data = flat_analysis(asset_list)
 
     # Store as an excel
-    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
+    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx"
     # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
 
     with pd.ExcelWriter(filename) as writer:
-        asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
+        asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
         flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
 
     matches_review = asset_list[

From 99a0948e2bd3ab14197821a694cbf1d2383baff3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 24 Feb 2025 16:11:02 +0000
Subject: [PATCH 65/72] getting ready to work on the colchester data

---
 asset_list/AssetList.py | 82 ++++++++++++++++++++++++++++++++--------
 asset_list/app.py       | 83 ++++++-----------------------------------
 2 files changed, 78 insertions(+), 87 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 54f6cd96..2b80287c 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -343,6 +343,7 @@ class AssetList:
         self.standardised_asset_list = self.raw_asset_list.copy()
         # Will be used to store aggregated figures against the various work types
         self.work_type_figures = {}
+        self.flat_data = None
 
         # We detect the presence of the non-intrusive columns
         self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
@@ -649,6 +650,9 @@ class AssetList:
         logger.info("Applying standardisation to asset list")
 
         for variable, mapping in self.variable_mappings.items():
+            self.standardised_asset_list[variable + "_original_from_landlord"] = (
+                self.standardised_asset_list[variable].copy()
+            )
             self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
 
         if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
@@ -663,6 +667,12 @@ class AssetList:
 
         # Apply renames to our standard names
         # Perform final variable selection and renaming:
+
+        # We add the original columns to the keep variables
+        self.keep_variables += [
+            k + "_original_from_landlord" for k in self.variable_mappings.keys()
+        ]
+
         self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename(
             columns=self.rename_map
         )
@@ -912,18 +922,6 @@ class AssetList:
                         self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD
                 )
             )
-
-            self.standardised_asset_list["empty_cavity"] = (
-                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] |
-                self.standardised_asset_list["epc_indicates_empty_cavity"]
-            )
-            # We add a reason
-            self.standardised_asset_list["empty_cavity_reason"] = np.where(
-                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"],
-                "Non-Intrusive Data",
-                "EPC Data"
-            )
-
             ######################################################
             # Extraction
             ######################################################
@@ -933,7 +931,7 @@ class AssetList:
             self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
                 (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
                 (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
-                (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"])
+                (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"])
                  ) & (
                     self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
                 )
@@ -996,6 +994,12 @@ class AssetList:
                 )
             )
 
+            self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = (
+                self.standardised_asset_list["non-intrusives: Insulated"].isin(
+                    ["EWI", "RETRO DRILLED", "FILLED AT BUILD"]
+                )
+            )
+
             # TODO: We don't have information about the roof from this landlord
 
             # We merge on the u-value for average thermal transmittance
@@ -1146,7 +1150,8 @@ class AssetList:
                 # The walls are insulated
                 (
                     self.standardised_asset_list["solar_landlord_walls_insulated"] |
-                    self.standardised_asset_list["solar_epc_walls_insulated"]
+                    self.standardised_asset_list["solar_epc_walls_insulated"] |
+                    self.standardised_asset_list["solar_non_intrusives_walls_insulated"]
                 ) &
                 # Roof is insulated
                 self.standardised_asset_list["solar_epc_roof_insulated"] &
@@ -1165,7 +1170,8 @@ class AssetList:
                 # The walls are insulated
                 (
                     self.standardised_asset_list["solar_landlord_walls_insulated"] |
-                    self.standardised_asset_list["solar_epc_walls_insulated"]
+                    self.standardised_asset_list["solar_epc_walls_insulated"] |
+                    self.standardised_asset_list["solar_non_intrusives_walls_insulated"]
                 ) &
                 # Roof is insulated
                 self.standardised_asset_list["solar_epc_loft_needs_topup"] &
@@ -1216,6 +1222,15 @@ class AssetList:
                 columns=["walls_u_value", "roof_u_value", "floor_u_value"]
             )
 
+            # Adjust flagged extraction jobs to remove anything for solar
+            self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
+                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] &
+                ~self.standardised_asset_list["solar_eligible_solid_floor"] &
+                ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"]
+                # ~self.standardised_asset_list["solar_eligible_other_floor"] &
+                # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"]
+            )
+
         # Produce some aggregate figures
         self.work_type_figures = {
             # Empty cavity from non-intrusives
@@ -1296,3 +1311,40 @@ class AssetList:
             "Other Floor, Insulated, Needs Loft",
             self.standardised_asset_list["solar_reason"]
         )
+
+    def flat_analysis(self):
+
+        # We need to deduce the building name - we strip out the house number
+
+        # We want to deduce if flats have 50% of the properties below C75
+        # We group by postcode and property type
+        grouped = self.standardised_asset_list.groupby(
+            [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE]
+        )
+
+        flat_data = []
+        for _, group in grouped:
+            if "flat" in group[self.STANDARD_PROPERTY_TYPE].values:
+                num_flats = group[self.STANDARD_PROPERTY_TYPE].shape[0]
+                num_below_c75 = group[
+                    self.EPC_API_DATA_NAMES["current-energy-efficiency"]
+                ].lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum()
+                # Check if any flats are below C69
+                num_flats_below_c69 = group[
+                    self.EPC_API_DATA_NAMES["current-energy-efficiency"]
+                ].lt(69).sum()
+
+                flat_data.append(
+                    {
+                        "Postcode": group[self.STANDARD_POSTCODE].iloc[0],
+                        "Property Type": "Flat",
+                        "Number of Flats with EPC": num_flats,
+                        "Number of Flats below C75": num_below_c75,
+                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats),
+                        "Number of Flats Below C69": num_flats_below_c69,
+                    }
+                )
+
+        flat_data = pd.DataFrame(flat_data)
+
+        self.flat_data = flat_data
diff --git a/asset_list/app.py b/asset_list/app.py
index 65d4ab87..f164e94e 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -4,6 +4,7 @@ import json
 import pandas as pd
 import numpy as np
 from tqdm import tqdm
+from pprint import pprint
 import msgpack
 from utils.s3 import read_from_s3
 from asset_list.AssetList import AssetList
@@ -239,23 +240,18 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    # For Westward
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
-    DATA_FILENAME = "WESTWARD - completed list..xlsx"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
     SHEET_NAME = "Sheet1"
-
-    POSTCODE_COLUMN = "WFT EDIT Postcode"
-    FULLADDRESS_COLUMN = "Address"
+    POSTCODE_COLUMN = 'Full Address.1'
+    FULLADDRESS_COLUMN = "Full Address"
     ADDRESS1_COLUMN = None
-    ADDRESS1_METHOD = "house_number_extraction"
-
+    ADDRESS1_METHOD = "first_word"
     ADDRESS_COLS_TO_CONCAT = []
     MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = "Build date"
-    UPRN_COLUMN = "UPRN"
-    # If we have the non-intrusives data, this should be true
-    HAS_NON_INTRUSIVES = True
-    PROPERTY_TYPE_COLUMN = "Location type"  # This will be used to identify and remove bedsits
+    PROPERTY_YEAR_BUILT = "Build Date"
+    UPRN_COLUMN = None
+    PROPERTY_TYPE_COLUMN = None
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
@@ -297,20 +293,6 @@ def app():
 
     asset_list.apply_standardiation()
 
-    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
-    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
-    # SHEET_NAME = "Sheet1"
-    # POSTCODE_COLUMN = 'Full Address.1'
-    # FULLADDRESS_COLUMN = "Full Address"
-    # ADDRESS1_COLUMN = None
-    # ADDRESS1_METHOD = "first_word"
-    # ADDRESS_COLS_TO_CONCAT = []
-    # MISSING_POSTCODES_METHOD = None
-    # PROPERTY_YEAR_BUILT = "Build Date"
-    # UPRN_COLUMN = None
-    # # If we have the non-intrusives data, this should be true
-    # HAS_NON_INTRUSIVES = True
-
     ### We retrieve the EPC data
 
     # We chunk up this data into 5000 rows at a time
@@ -455,48 +437,9 @@ def app():
 
     asset_list.identify_worktypes(cleaned)
 
-    from pprint import pprint
     pprint(asset_list.work_type_figures)
 
-    # TODO: We should do this breakdown for flats
-    def flat_analysis(asset_list):
-
-        # We need to deduce the building name - we strip out the house number
-
-        # We want to deduce if flats have 50% of the properties below C75
-        # We group by postcode and property type
-        grouped = asset_list.standardised_asset_list.groupby(
-            [asset_list.STANDARD_POSTCODE, asset_list.STANDARD_PROPERTY_TYPE]
-        )
-
-        flat_data = []
-        for _, group in grouped:
-            if "flat" in group[asset_list.STANDARD_PROPERTY_TYPE].values:
-                num_flats = group[asset_list.STANDARD_PROPERTY_TYPE].shape[0]
-                num_below_c75 = group[
-                    asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"]
-                ].lt(asset_list.FILLED_CAVITY_SAP_THRESHOLD).sum()
-                # Check if any flats are below C69
-                num_flats_below_c69 = group[
-                    asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"]
-                ].lt(69).sum()
-
-                flat_data.append(
-                    {
-                        "Postcode": group[asset_list.STANDARD_POSTCODE].iloc[0],
-                        "Property Type": "Flat",
-                        "Number of Flats with EPC": num_flats,
-                        "Number of Flats below C75": num_below_c75,
-                        "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats),
-                        "num_flats_below_c69": num_flats_below_c69,
-                    }
-                )
-
-        flat_data = pd.DataFrame(flat_data)
-
-        return flat_data
-
-    flat_data = flat_analysis(asset_list)
+    asset_list.flat_analysis()
 
     # Store as an excel
     filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx"
@@ -504,8 +447,4 @@ def app():
 
     with pd.ExcelWriter(filename) as writer:
         asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
-        flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
-
-    matches_review = asset_list[
-        [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
-    ]
+        asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)

From 5391afeaaaa024ff7b1a54fc18f565b9c46a3925 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 24 Feb 2025 16:52:42 +0000
Subject: [PATCH 66/72] handling the case of landlord property id being missing

---
 asset_list/AssetList.py |  2 +-
 asset_list/app.py       | 58 ++++++++++++++++++++++-------------------
 2 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 2b80287c..c2784eb1 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -378,7 +378,7 @@ class AssetList:
         self.keep_variables = []
 
         # Finally, we handle the case where the landlord's property ID is actually the OS UPRN
-        if self.landlord_uprn == self.landlord_property_id:
+        if (self.landlord_uprn == self.landlord_property_id) and (self.landlord_property_id is not None):
             self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy()
             # Update the reference to landlord UPRn
             self.landlord_uprn = self.STANDARD_UPRN
diff --git a/asset_list/app.py b/asset_list/app.py
index f164e94e..89b15c06 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -240,39 +240,43 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
-    DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
-    SHEET_NAME = "Sheet1"
-    POSTCODE_COLUMN = 'Full Address.1'
-    FULLADDRESS_COLUMN = "Full Address"
-    ADDRESS1_COLUMN = None
-    ADDRESS1_METHOD = "first_word"
-    ADDRESS_COLS_TO_CONCAT = []
-    MISSING_POSTCODES_METHOD = None
-    PROPERTY_YEAR_BUILT = "Build Date"
-    UPRN_COLUMN = None
-    PROPERTY_TYPE_COLUMN = None
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'Full Address.1'
+    fulladdress_column = "Full Address"
+    address1_column = None
+    address1_method = "first_word"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Build Date"
+    landlord_os_uprn = None
+    landlord_property_type = "Property Type"
+    landlord_wall_construction = "Wallinsul"
+    landlord_heating_system = "HeatSorc"
+    landlord_existing_pv = None
+    landlord_property_id = None
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
 
     asset_list = AssetList(
-        local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
+        local_filepath=os.path.join(data_folder, data_filename),
         header=0,
-        sheet_name=SHEET_NAME,
-        address1_colname=ADDRESS1_COLUMN,
-        postcode_colname=POSTCODE_COLUMN,
-        landlord_property_id="UPRN",
-        full_address_colname=FULLADDRESS_COLUMN,
-        full_address_cols_to_concat=ADDRESS_COLS_TO_CONCAT,
-        missing_postcodes_method=MISSING_POSTCODES_METHOD,
-        address1_extraction_method=ADDRESS1_METHOD,
-        landlord_year_built=PROPERTY_YEAR_BUILT,
-        landlord_uprn=UPRN_COLUMN,
-        landlord_property_type=PROPERTY_TYPE_COLUMN,
-        landlord_wall_construction="Wall Construction (EPC)",
-        landlord_heating_system="Heat Source",
-        landlord_existing_pv="PV (Y/N)"
+        sheet_name=sheet_name,
+        address1_colname=address1_column,
+        postcode_colname=postcode_column,
+        landlord_property_id=landlord_property_id,
+        full_address_colname=fulladdress_column,
+        full_address_cols_to_concat=address_cols_to_concat,
+        missing_postcodes_method=missing_postcodes_method,
+        address1_extraction_method=address1_method,
+        landlord_year_built=landlord_year_built,
+        landlord_uprn=landlord_os_uprn,
+        landlord_property_type=landlord_property_type,
+        landlord_wall_construction=landlord_wall_construction,
+        landlord_heating_system=landlord_heating_system,
+        landlord_existing_pv=landlord_existing_pv
     )
     asset_list.init_standardise()
 

From 8fa8307e33dc27793815eccadbb11fa3a28d1c68 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 24 Feb 2025 18:36:00 +0000
Subject: [PATCH 67/72] ai mappings

---
 asset_list/AssetList.py                | 32 ++++++++++++++++++++++-
 asset_list/app.py                      |  2 +-
 asset_list/mappings/heating_systems.py | 35 ++++++++++++++++++++------
 asset_list/mappings/property_type.py   |  9 ++++++-
 asset_list/mappings/walls.py           | 13 +++++++---
 5 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index c2784eb1..06ec5907 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -598,7 +598,35 @@ class AssetList:
                     self.standardised_asset_list[self.landlord_year_built].dt.year
                 )
             else:
-                raise NotImplementedError("Year built column must be a datetime - implement me")
+                # We attempt to extract the year built as an integer, by detecting the format of each value
+
+                def extract_year(date_str):
+                    """
+                    Extracts the year from a 'DD-Mon-YYYY' string, a datetime, or a 4-digit year value.
+                    Returns the year as an int, None for null/known-error values, else raises NotImplementedError.
+                    """
+                    known_errors = ["#MULTIVALUE"]
+
+                    if pd.isnull(date_str) or date_str in known_errors:
+                        return None
+
+                    if isinstance(date_str, str):
+                        match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str)
+                        if match:
+                            return int(match.group(1))  # Extract the year and convert to integer
+
+                    if isinstance(date_str, datetime):
+                        return date_str.year
+
+                    # Check if date_str is a year itself
+                    if str(date_str).isdigit() & (len(str(date_str)) == 4):
+                        return int(date_str)
+
+                    raise NotImplementedError("Unhandled format for year built - implement me")
+
+                self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[
+                    self.landlord_year_built
+                ].apply(extract_year)
 
         # We now create standard lookups
         to_remap = {
@@ -619,6 +647,8 @@ class AssetList:
                 "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS
             }
         }
+        # Keep just entries where the key is not None
+        to_remap = {k: v for k, v in to_remap.items() if k is not None}
 
         for variable, config in to_remap.items():
             logger.info("Standardising variable: %s", variable)
diff --git a/asset_list/app.py b/asset_list/app.py
index 89b15c06..1cb7808e 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -255,7 +255,7 @@ def app():
     landlord_wall_construction = "Wallinsul"
     landlord_heating_system = "HeatSorc"
     landlord_existing_pv = None
-    landlord_property_id = None
+    landlord_property_id = "Property Reference"
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
index 89bfe0c4..b58f13f2 100644
--- a/asset_list/mappings/heating_systems.py
+++ b/asset_list/mappings/heating_systems.py
@@ -1,3 +1,5 @@
+import numpy as np
+
 STANDARD_HEATING_SYSTEMS = {
     "gas combi boiler",
     "electric storage heaters",
@@ -35,12 +37,31 @@ HEATING_MAPPINGS = {
     "Eco Electric Radiators": "electric radiators",
     "Gas fire": "other",
     "Backboiler - Solid fuel": "other",
-    'combi - gas': 'gas combi boiler', 'e7 storage heaters': 'electric storage heaters',
-    'district heating system': 'district heating', 'condensing boiler - gas': 'gas condensing boiler',
-    'boiler oil/other': 'oil boiler', 'condensing combi - gas': 'gas condensing combi',
-    'air source source heat pump': 'air source heat pump', 'biomass boiler': 'boiler - other fuel',
-    'ground source heat pump': 'ground source heat pump', 'electric oil filled radiators': 'electric radiators',
-    'solid fuel': 'other', 'lpg boiler': 'boiler - other fuel', 'electric boiler': 'electric boiler',
+    'combi - gas': 'gas combi boiler',
+    'e7 storage heaters': 'electric storage heaters',
+    'district heating system': 'district heating',
+    'condensing boiler - gas': 'gas condensing boiler',
+    'boiler oil/other': 'oil boiler',
+    'condensing combi - gas': 'gas condensing combi',
+    'air source source heat pump': 'air source heat pump',
+    'biomass boiler': 'boiler - other fuel',
+    'ground source heat pump': 'ground source heat pump',
+    'electric oil filled radiators': 'electric radiators',
+    'solid fuel': 'other',
+    'lpg boiler': 'boiler - other fuel',
+    'electric boiler': 'electric boiler',
     'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler',
-    'eco electric radiators': 'electric radiators', 'gas fire': 'other', 'backboiler - solid fuel': 'other',
+    'eco electric radiators': 'electric radiators',
+    'gas fire': 'other', 'backboiler - solid fuel': 'other',
+    'ASHP': 'air source heat pump',
+    'COMMHEAT': 'communal gas boiler',
+    'GBB': 'gas combi boiler',
+    'GFS': 'gas condensing boiler',
+    'GWA': 'gas condensing boiler',
+    'GWM': 'gas condensing combi',
+    'HDU': 'district heating',
+    'OILBLR': 'oil boiler',
+    'SOLIDFUEL': 'boiler - other fuel',
+    'STORHTR': 'high heat retention storage heaters',
+    np.nan: 'unknown',
 }
diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py
index ec569123..2612f058 100644
--- a/asset_list/mappings/property_type.py
+++ b/asset_list/mappings/property_type.py
@@ -1,7 +1,7 @@
 # These are the standard categories for property types
 STANDARD_PROPERTY_TYPES = {
     "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house",
-    "unknown", "other"
+    "unknown", "other", "block of flats"
 }
 
 # This is a basic mapping that we use to map values that we've seen commonly to standard values
@@ -15,4 +15,11 @@ PROPERTY_MAPPING = {
     "BEDSIT": "bedsit",
     "COACHSE": "coach house",
     "coachse": "coach house",
+    'Admin Unit Type': 'unknown',
+    'Block': 'block of flats',
+    'Bungalow': 'bungalow',
+    'Flat': 'flat',
+    'House': 'house',
+    'Maisonette': 'maisonette',
+    'Stairwell': 'other'
 }
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
index 1fc52fcb..82b31d01 100644
--- a/asset_list/mappings/walls.py
+++ b/asset_list/mappings/walls.py
@@ -1,7 +1,8 @@
 STANDARD_WALL_CONSTRUCTIONS = {
     "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation",
-    "timber frame", "uninsulated solid brick",
-    "insulated solid brick", "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone",
+    "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation",
+    "timber frame",
+    "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone",
     "cob",
     "new build - average thermal transmittance",
 }
@@ -70,8 +71,7 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'average thermal transmittance 0.28 w/m?k': 'unknown',
     'Cavity wall, filled cavity': 'filled cavity',
     'Cavity wall, filled cavity and external insulation': 'filled cavity',
-    'Granite or whinstone, as built, no insulation (assumed)': 'granite or '
-                                                               'whinstone',
+    'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone',
     'Solid brick, as built, insulated (assumed)': 'insulated solid brick',
     'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick',
     'Solid brick, with external insulation': 'insulated solid brick',
@@ -84,4 +84,9 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'Timber frame, as built, no insulation (assumed)': 'timber frame',
     'Timber frame, as built, partial insulation (assumed)': 'timber frame',
     'Timber frame, with additional insulation': 'timber frame',
+    'CAVITY': 'partial unknown cavity',
+    'COMB': 'unknown',
+    'NONE': 'unknown',
+    'NOTKNOWN': 'unknown',
+    'SOLID': 'solid brick unknown insulation',
 }

From c3049732f0d680a38aa9acacd3f15ff9e16d80f0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 24 Feb 2025 18:44:06 +0000
Subject: [PATCH 68/72] handling block of flats

---
 asset_list/AssetList.py                |  7 +++++++
 asset_list/app.py                      | 25 ++++++++++++++++---------
 asset_list/mappings/heating_systems.py |  2 +-
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 06ec5907..72086c60 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -344,6 +344,7 @@ class AssetList:
         # Will be used to store aggregated figures against the various work types
         self.work_type_figures = {}
         self.flat_data = None
+        self.duplicated_addresses = None
 
         # We detect the presence of the non-intrusive columns
         self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
@@ -691,6 +692,12 @@ class AssetList:
                 f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated "
                 f"addresses - dropping"
             )
+
+            # Keep a record of duplicates
+            self.duplicated_addresses = self.standardised_asset_list[
+                self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
+            ][[self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]].copy()
+
             self.standardised_asset_list = self.standardised_asset_list[
                 ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
             ]
diff --git a/asset_list/app.py b/asset_list/app.py
index 1cb7808e..a24c4043 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -45,6 +45,12 @@ def get_data(
     no_epc = []
     for _, home in tqdm(df.iterrows(), total=len(df)):
         try:
+
+            # If we have a block of flats, we cannot retrieve this data
+            if home[AssetList.STANDARD_PROPERTY_TYPE] == "block of flats":
+                no_epc.append(home[row_id_name])
+                continue
+
             postcode = home[postcode_column]
             house_number = str(home[address1_column]).strip()
             full_address = home[fulladdress_column].strip()
@@ -283,16 +289,17 @@ def app():
     # We produce the new maps, which can be saved for future useage
 
     new_property_type_map = PROPERTY_MAPPING.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_property_type]
+        asset_list.variable_mappings[asset_list.landlord_property_type] if asset_list.landlord_property_type else {}
     )
     new_wall_map = WALL_CONSTRUCTION_MAPPINGS.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_wall_construction]
+        asset_list.variable_mappings[asset_list.landlord_wall_construction] if
+        asset_list.landlord_wall_construction else {}
     )
     new_heating_map = HEATING_MAPPINGS.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_heating_system]
+        asset_list.variable_mappings[asset_list.landlord_heating_system] if asset_list.landlord_heating_system else {}
     )
     new_existing_pv_map = EXISTING_PV_MAPPINGS.copy().update(
-        asset_list.variable_mappings[asset_list.landlord_existing_pv]
+        asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {}
     )
 
     asset_list.apply_standardiation()
@@ -305,7 +312,7 @@ def app():
     skip = None  # Used to skip already completed chunks
     chunk_size = 5000
     filename = "Chunk {i}.csv"
-    download_folder = os.path.join(DATA_FOLDER, "Chunks")
+    download_folder = os.path.join(data_folder, "Chunks")
     if not os.path.exists(download_folder):
         os.makedirs(download_folder)
 
@@ -343,12 +350,12 @@ def app():
 
         # Append the failed data to the main data
         # Store the chunk locally as a csv
-        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
+        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False)
         # Store the errors and no-data locally
-        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} errors.json"), "w") as f:
+        with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f:
             json.dump(errors_chunk, f)
 
-        with open(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
+        with open(os.path.join(data_folder, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
             json.dump(no_epc_chunk, f)
 
     # We read in and concatenate the created created chunks
@@ -446,7 +453,7 @@ def app():
     asset_list.flat_analysis()
 
     # Store as an excel
-    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx"
+    filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx"
     # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
 
     with pd.ExcelWriter(filename) as writer:
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
index b58f13f2..4879efcc 100644
--- a/asset_list/mappings/heating_systems.py
+++ b/asset_list/mappings/heating_systems.py
@@ -62,6 +62,6 @@ HEATING_MAPPINGS = {
     'HDU': 'district heating',
     'OILBLR': 'oil boiler',
     'SOLIDFUEL': 'boiler - other fuel',
-    'STORHTR': 'high heat retention storage heaters',
+    'STORHTR': 'electric storage heaters',
     np.nan: 'unknown',
 }

From 0ffc59861c4d70a822c0830838bc740a2598331f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 25 Feb 2025 08:19:08 +0000
Subject: [PATCH 69/72] examining results on colchester

---
 asset_list/AssetList.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 72086c60..0156a2a3 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -233,7 +233,8 @@ class AssetList:
         "secondheat-description": "epc_secondary_heating",
         "transaction-type": "epc_reason",
         "energy-consumption-current": "epc_heat_demand",
-        "photo-supply": "epc_photo_supply"
+        "photo-supply": "epc_photo_supply",
+        "estimated": "estimated"
     }
     FIND_EPC_DATA_NAMES = {
         "heating_text": "epc_estiamted_heating_kwh",
@@ -714,6 +715,22 @@ class AssetList:
             columns=self.rename_map
         )
 
+        # We fill any standard columns that are not in the data because they were not provided by the landlord
+        missing_variables = [
+            v for v in [
+                self.STANDARD_EXISTING_PV,
+                self.STANDARD_HEATING_SYSTEM,
+                self.STANDARD_UPRN,
+                self.STANDARD_PROPERTY_TYPE,
+                self.STANDARD_YEAR_BUILT,
+                self.STANDARD_WALL_CONSTRUCTION,
+                self.STANDARD_HEATING_SYSTEM,
+                self.STANDARD_EXISTING_PV
+            ] if v not in self.standardised_asset_list.columns
+        ]
+        for v in missing_variables:
+            self.standardised_asset_list[v] = None
+
     def merge_data(self, df: pd.DataFrame):
         """
         Used to insert data into the standardised asset list, based on the domna property id
@@ -963,7 +980,6 @@ class AssetList:
             # Extraction
             ######################################################
 
-            # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged
             # as needing a CIGA check. What is the logic we should be applying here?
             self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
                 (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
@@ -974,6 +990,15 @@ class AssetList:
                 )
             )
 
+            z = self.standardised_asset_list[
+                self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES"
+                ]
+            z["non-intrusives: Insulated"].value_counts()
+            z["non-intrusives: Material"].value_counts()
+            z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts()
+            z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max()
+            zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105]
+
             ######################################################
             # Solar
             ######################################################

From 67f3e8ab703ea2893cdb9f9a6a9bd7bbee9344f8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 25 Feb 2025 08:41:08 +0000
Subject: [PATCH 70/72] reviewing methodology

---
 asset_list/AssetList.py | 51 +++++++++++++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 0156a2a3..76f2b145 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -951,7 +951,7 @@ class AssetList:
             ######################################################
             # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled
             # 2) The age is before 1995
-            # TODO: 3) Remove anything that likley has access issues
+            # 3) We don't remove anything that has access issues yet
             self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = (
                 (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
                 (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
@@ -976,6 +976,19 @@ class AssetList:
                         self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD
                 )
             )
+
+            z0 = self.standardised_asset_list[
+                self.standardised_asset_list["epc_indicates_empty_cavity"] & (
+                    ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]
+                )
+                ]
+            z0['non-intrusives: Construction'].value_counts()
+            z0['non-intrusives: Insulated'].value_counts()
+            z00 = z0[z0['non-intrusives: Insulated'] == "EWI"]
+
+            # If the EPC is estimated, perhaps we should defer to the non-intrusives?
+            z00[""]
+
             ######################################################
             # Extraction
             ######################################################
@@ -990,14 +1003,26 @@ class AssetList:
                 )
             )
 
-            z = self.standardised_asset_list[
-                self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES"
-                ]
-            z["non-intrusives: Insulated"].value_counts()
-            z["non-intrusives: Material"].value_counts()
-            z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts()
-            z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max()
-            zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105]
+            # z3 = self.standardised_asset_list[
+            #     self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"]
+            # ]
+            # z3['non-intrusives: Material'].value_counts()
+            # self.standardised_asset_list['non-intrusives: Material'].value_counts()
+            #
+            # z = self.standardised_asset_list[
+            #     self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES"
+            #     ]
+            # z["non-intrusives: Insulated"].value_counts()
+            # z["non-intrusives: Material"].value_counts()
+            # z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts()
+            # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max()
+            # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].min()
+            # z[self.STANDARD_YEAR_BUILT].describe()
+            #
+            # zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105]
+            # z2 = self.standardised_asset_list[
+            #     self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "NO"
+            #     ]
 
             ######################################################
             # Solar
@@ -1159,6 +1184,10 @@ class AssetList:
                         .lower().str.contains("solid")
                     ) & (
                         ~self.standardised_asset_list["epc_has_floor_recommendation"]
+                    ) & (
+                        # We do not utilise estimated EPCs for this method because we will always find that
+                        # "epc_has_floor_recommendation" is False
+                        ~self.standardised_asset_list["estimated"]
                     )
                 ) | (
                     (
@@ -1180,6 +1209,10 @@ class AssetList:
                         .lower().str.contains("suspended")
                     ) & (
                         ~self.standardised_asset_list["epc_has_floor_recommendation"]
+                    ) & (
+                        # We do not utilise estimated EPCs for this method because we will always find that
+                        # "epc_has_floor_recommendation" is False
+                        ~self.standardised_asset_list["estimated"]
                     )
                 ) | (
                     (

From ddfbf33494f6741b974217fffc5bb4ba784560a0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 26 Feb 2025 11:00:12 +0000
Subject: [PATCH 71/72] westward complete

---
 asset_list/AssetList.py                 | 95 ++++++++++++++-----------
 asset_list/app.py                       | 42 +++++++----
 asset_list/mappings/walls.py            |  2 +-
 etl/customers/remote_assessments/app.py | 14 ++--
 recommendations/HeatingRecommender.py   |  2 +-
 5 files changed, 94 insertions(+), 61 deletions(-)

diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 76f2b145..31b11c66 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -286,7 +286,7 @@ class AssetList:
     # This SAP threshold is a key search criteria for properties that may be eligible for extraction
     FILLED_CAVITY_SAP_THRESHOLD = 75
     # This SAP the
-    EMPTY_CAVITY_SAP_THRESHOLD = 71
+    EMPTY_CAVITY_SAP_THRESHOLD = 75
     # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable
     EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5
 
@@ -956,13 +956,28 @@ class AssetList:
                 (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
                 (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
                 self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) &
-                (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) &
+                (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002) &
                 (
                     self.standardised_asset_list[
                         self.EPC_API_DATA_NAMES["current-energy-efficiency"]
                     ] <= self.EMPTY_CAVITY_SAP_THRESHOLD
                 )
             )
+            # Let's also flag work that looks eligible without the SAP filter
+            self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = (
+                (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
+                (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
+                self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) &
+                (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2002)
+            )
+
+            # If non_intrusive_indicates_empty_cavity is True,
+            # set non_intrusive_indicates_empty_cavity_no_sap_filter to False
+            self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where(
+                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"],
+                False,
+                self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"]
+            )
 
             self.standardised_asset_list["epc_indicates_empty_cavity"] = (
                 self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin(
@@ -977,17 +992,16 @@ class AssetList:
                 )
             )
 
-            z0 = self.standardised_asset_list[
-                self.standardised_asset_list["epc_indicates_empty_cavity"] & (
-                    ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]
-                )
-                ]
-            z0['non-intrusives: Construction'].value_counts()
-            z0['non-intrusives: Insulated'].value_counts()
-            z00 = z0[z0['non-intrusives: Insulated'] == "EWI"]
-
-            # If the EPC is estimated, perhaps we should defer to the non-intrusives?
-            z00[""]
+            # If the EPC is estimated, we defer to the non-intrusives
+            self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where(
+                (
+                    self.standardised_asset_list["epc_indicates_empty_cavity"] &
+                    ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
+                    self.standardised_asset_list["estimated"]
+                ),
+                False,
+                self.standardised_asset_list["epc_indicates_empty_cavity"]
+            )
 
             ######################################################
             # Extraction
@@ -997,33 +1011,14 @@ class AssetList:
             self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
                 (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
                 (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
-                (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"])
+                (~self.standardised_asset_list['non-intrusives: Material'].isin(
+                    ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"]
+                )
                  ) & (
                     self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
                 )
             )
 
-            # z3 = self.standardised_asset_list[
-            #     self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"]
-            # ]
-            # z3['non-intrusives: Material'].value_counts()
-            # self.standardised_asset_list['non-intrusives: Material'].value_counts()
-            #
-            # z = self.standardised_asset_list[
-            #     self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "YES"
-            #     ]
-            # z["non-intrusives: Insulated"].value_counts()
-            # z["non-intrusives: Material"].value_counts()
-            # z[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW].value_counts()
-            # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].max()
-            # z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]].min()
-            # z[self.STANDARD_YEAR_BUILT].describe()
-            #
-            # zz = z[z[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] == 105]
-            # z2 = self.standardised_asset_list[
-            #     self.standardised_asset_list["non-intrusives: CIGA Check Required"] == "NO"
-            #     ]
-
             ######################################################
             # Solar
             ######################################################
@@ -1114,7 +1109,7 @@ class AssetList:
                 ) | (
                     self.standardised_asset_list[
                         "walls_u_value"].apply(
-                        lambda x: x <= 0.3 if not pd.isnull(
+                        lambda x: x <= 0.7 if not pd.isnull(
                             x) else False
                     )
                 )
@@ -1141,7 +1136,7 @@ class AssetList:
                     "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False
                 ) | (
                     self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
-                        lambda x: int(x) >= 270 if str(x).isdigit() else False
+                        lambda x: int(x) >= 200 if str(x).isdigit() else False
                     )
                 ) | (
                     self.standardised_asset_list["roof_u_value"].apply(
@@ -1152,7 +1147,7 @@ class AssetList:
 
             self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[
                 self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
-                lambda x: int(x) < 270 if str(x).isdigit() else False
+                lambda x: int(x) < 200 if str(x).isdigit() else False
             )
 
             # TODO: Fill with False - should be temp!
@@ -1187,7 +1182,7 @@ class AssetList:
                     ) & (
                         # We do not utilise estimated EPCs for this method because we will always find that
                         # "epc_has_floor_recommendation" is False
-                        ~self.standardised_asset_list["estimated"]
+                        (self.standardised_asset_list["estimated"] == False)
                     )
                 ) | (
                     (
@@ -1212,7 +1207,7 @@ class AssetList:
                     ) & (
                         # We do not utilise estimated EPCs for this method because we will always find that
                         # "epc_has_floor_recommendation" is False
-                        ~self.standardised_asset_list["estimated"]
+                        self.standardised_asset_list["estimated"] == False
                     )
                 ) | (
                     (
@@ -1274,6 +1269,7 @@ class AssetList:
             )
 
             # Other floor type, fully insulated
+
             self.standardised_asset_list["solar_eligible_other_floor"] = (
                 # Landlord data or EPC data indicates the heating system is appropriate
                 (
@@ -1332,6 +1328,9 @@ class AssetList:
             "Empty Cavity (non-intrusives)": (
                 self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum()
             ),
+            "Empty Cavity (non-intrusives, no SAP filter)": (
+                self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum()
+            ),
             "Empty Cavity (EPC)": (
                 (
                     self.standardised_asset_list["epc_indicates_empty_cavity"] &
@@ -1359,6 +1358,17 @@ class AssetList:
             )
         }
 
+        # We produce a breakdown of the property types, for cavity fills
+        cavity_fills = self.standardised_asset_list[
+            self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | (
+                self.standardised_asset_list["epc_indicates_empty_cavity"]
+            )
+            ]
+
+        self.work_type_breakdowns = {
+            "empty_cavity": cavity_fills[self.STANDARD_PROPERTY_TYPE].value_counts()
+        }
+
         # Finally, we note why each property has been flagged
         self.standardised_asset_list["cavity_reason"] = None
         self.standardised_asset_list["cavity_reason"] = np.where(
@@ -1366,6 +1376,11 @@ class AssetList:
             "Non-Intrusive Data Showed Empty Cavity",
             self.standardised_asset_list["cavity_reason"]
         )
+        self.standardised_asset_list["cavity_reason"] = np.where(
+            self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"],
+            "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed",
+            self.standardised_asset_list["cavity_reason"]
+        )
         self.standardised_asset_list["cavity_reason"] = np.where(
             (
                 self.standardised_asset_list["epc_indicates_empty_cavity"] &
diff --git a/asset_list/app.py b/asset_list/app.py
index a24c4043..09ccac02 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -246,22 +246,40 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
-    data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    # sheet_name = "Sheet1"
+    # postcode_column = 'Full Address.1'
+    # fulladdress_column = "Full Address"
+    # address1_column = None
+    # address1_method = "first_word"
+    # address_cols_to_concat = []
+    # missing_postcodes_method = None
+    # landlord_year_built = "Build Date"
+    # landlord_os_uprn = None
+    # landlord_property_type = "Property Type"
+    # landlord_wall_construction = "Wallinsul"
+    # landlord_heating_system = "HeatSorc"
+    # landlord_existing_pv = None
+    # landlord_property_id = "Property Reference"
+
+    # For Westward
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    data_filename = "WESTWARD - completed list..xlsx"
     sheet_name = "Sheet1"
-    postcode_column = 'Full Address.1'
-    fulladdress_column = "Full Address"
+    postcode_column = "WFT EDIT Postcode"
+    fulladdress_column = "Address"
     address1_column = None
-    address1_method = "first_word"
+    address1_method = "house_number_extraction"
     address_cols_to_concat = []
     missing_postcodes_method = None
-    landlord_year_built = "Build Date"
-    landlord_os_uprn = None
-    landlord_property_type = "Property Type"
-    landlord_wall_construction = "Wallinsul"
-    landlord_heating_system = "HeatSorc"
-    landlord_existing_pv = None
-    landlord_property_id = "Property Reference"
+    landlord_year_built = "Build date"
+    landlord_os_uprn = "UPRN"
+    landlord_property_type = "Location type"
+    landlord_wall_construction = "Wall Construction (EPC)"
+    landlord_heating_system = "Heat Source"
+    landlord_existing_pv = "PV (Y/N)"
+    landlord_property_id = "Place ref"
 
     # Maps addresses to uprn in problematic cases
     MANUAL_UPRN_MAP = {}
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
index 82b31d01..78d64988 100644
--- a/asset_list/mappings/walls.py
+++ b/asset_list/mappings/walls.py
@@ -84,7 +84,7 @@ WALL_CONSTRUCTION_MAPPINGS = {
     'Timber frame, as built, no insulation (assumed)': 'timber frame',
     'Timber frame, as built, partial insulation (assumed)': 'timber frame',
     'Timber frame, with additional insulation': 'timber frame',
-    'CAVITY': 'partial unknown cavity',
+    'CAVITY': 'cavity unknown insulation',
     'COMB': 'unknown',
     'NONE': 'unknown',
     'NOTKNOWN': 'unknown',
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index 15f59c5e..aac0a1a6 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 133
+PORTFOLIO_ID = 137
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,10 +19,10 @@ def app():
 
     asset_list = [
         {
-            "address": "40",
-            "postcode": "PE4 5BB",
-            "uprn": 100090220519,
-        }
+            "address": "41 Gainsborough Way",
+            "postcode": "BA21 5XU",
+            "uprn": 30016708,
+        },
     ]
     asset_list = pd.DataFrame(asset_list)
 
@@ -52,8 +52,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 100090220519,
-            "valuation": 135_000
+            "uprn": 30016708,
+            "valuation": 189000
         }
     ]
     # Store valuation data to s3
diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py
index c5c07f89..dd81680a 100644
--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@@ -993,7 +993,7 @@ class HeatingRecommender:
         # We check if there's a mains connection and the hot water is inefficient, as this will improve with a boiler
         has_inefficient_water = (
             self.property.data["mains-gas-flag"] and
-            self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"]
+            self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]
         )
 
         non_invasive_recommendation = next((

From bb8070967b3f0e8e0234fd07e0428acc9568d208 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 3 Mar 2025 14:38:01 +0000
Subject: [PATCH 72/72] big commit

---
 .idea/Model.iml                         |  2 +-
 .idea/misc.xml                          |  2 +-
 asset_list/AssetList.py                 | 76 ++++++++++++++++++++++---
 asset_list/app.py                       | 67 +++++++++++-----------
 backend/Funding.py                      | 12 ++--
 backend/app/plan/router.py              |  2 +-
 etl/customers/remote_assessments/app.py | 41 ++++++++++---
 etl/find_my_epc/AssetListEpcData.py     | 20 +++++--
 recommendations/HeatingRecommender.py   |  2 +
 9 files changed, 159 insertions(+), 65 deletions(-)

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 96ad7a95..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index fb10c6b0..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
index 31b11c66..306edd99 100644
--- a/asset_list/AssetList.py
+++ b/asset_list/AssetList.py
@@ -344,6 +344,7 @@ class AssetList:
         self.standardised_asset_list = self.raw_asset_list.copy()
         # Will be used to store aggregated figures against the various work types
         self.work_type_figures = {}
+        self.work_type_breakdowns = {}
         self.flat_data = None
         self.duplicated_addresses = None
 
@@ -577,7 +578,7 @@ class AssetList:
         self.standardised_asset_list[self.landlord_wall_construction] = np.where(
             self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains(
                 "average thermal transmittance"
-            ),
+            ) == True,
             "new build - average thermal transmittance",
             self.standardised_asset_list[self.landlord_wall_construction]
         )
@@ -1019,6 +1020,23 @@ class AssetList:
                 )
             )
 
+            # Also include work without the SAP filter as optimistic
+            self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = (
+                (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
+                (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
+                (~self.standardised_asset_list['non-intrusives: Material'].isin(
+                    ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"]
+                )
+                 )
+            )
+
+            # Clear the no-SAP-filter flag where the SAP-filtered extraction flag is already True
+            self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where(
+                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"],
+                False,
+                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"]
+            )
+
             ######################################################
             # Solar
             ######################################################
@@ -1109,8 +1127,7 @@ class AssetList:
                 ) | (
                     self.standardised_asset_list[
                         "walls_u_value"].apply(
-                        lambda x: x <= 0.7 if not pd.isnull(
-                            x) else False
+                        lambda x: x <= 0.7 if not pd.isnull(x) else False
                     )
                 )
             )
@@ -1322,26 +1339,58 @@ class AssetList:
                 # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"]
             )
 
+        blocks_of_flats = self.standardised_asset_list[
+            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
+            ]
+
+        non_blocks_of_flats = self.standardised_asset_list[
+            self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
+            ]
+
         # Produce some aggregate figures
         self.work_type_figures = {
             # Empty cavity from non-intrusives
-            "Empty Cavity (non-intrusives)": (
-                self.standardised_asset_list["non_intrusive_indicates_empty_cavity"].sum()
+            "Empty Cavity (non-intrusives)": non_blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum(),
+            "Empty Cavity (non-intrusives, blocks of flats)": (
+                blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum()
             ),
             "Empty Cavity (non-intrusives, no SAP filter)": (
-                self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum()
+                non_blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum()
+            ),
+            "Empty Cavity (non-intrusives, no SAP filter, blocks of flats)": (
+                blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum()
             ),
             "Empty Cavity (EPC)": (
                 (
-                    self.standardised_asset_list["epc_indicates_empty_cavity"] &
-                    ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]
+                    non_blocks_of_flats["epc_indicates_empty_cavity"] &
+                    ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"]
+                ).sum()
+            ),
+            "Empty Cavity (EPC, blocks of flat)": (
+                (
+                    blocks_of_flats["epc_indicates_empty_cavity"] &
+                    ~blocks_of_flats["non_intrusive_indicates_empty_cavity"]
                 ).sum()
             ),
             "Cavity Extraction": (
+                (
+                    ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] &
+                    ~non_blocks_of_flats["epc_indicates_empty_cavity"] &
+                    non_blocks_of_flats["non_intrusive_indicates_cavity_extraction"]
+                ).sum()
+            ),
+            "Cavity Extraction (blocks of flats)": (
+                (
+                    ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] &
+                    ~blocks_of_flats["epc_indicates_empty_cavity"] &
+                    blocks_of_flats["non_intrusive_indicates_cavity_extraction"]
+                ).sum()
+            ),
+            "Cavity Extraction (no SAP filter)": (
                 (
                     ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
                     ~self.standardised_asset_list["epc_indicates_empty_cavity"] &
-                    self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"]
+                    self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"]
                 ).sum()
             ),
             "Solar PV (Solid Floor)": (
@@ -1398,6 +1447,15 @@ class AssetList:
             "Non-Intrusive Data Showed Cavity Extraction",
             self.standardised_asset_list["cavity_reason"]
         )
+        # Extraction without the SAP filter: only labels rows that have no cavity reason yet
+        self.standardised_asset_list["cavity_reason"] = np.where(
+            (
+                self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] &
+                pd.isnull(self.standardised_asset_list["cavity_reason"])
+            ),
+            "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed",
+            self.standardised_asset_list["cavity_reason"]
+        )
 
         # Flag solar
         self.standardised_asset_list["solar_reason"] = None
diff --git a/asset_list/app.py b/asset_list/app.py
index 09ccac02..84999e93 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -246,43 +246,43 @@ def app():
     # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
     # - Or the insulation required is loft/cavity (floors should be solid)
 
-    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
-    # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
-    # sheet_name = "Sheet1"
-    # postcode_column = 'Full Address.1'
-    # fulladdress_column = "Full Address"
-    # address1_column = None
-    # address1_method = "first_word"
-    # address_cols_to_concat = []
-    # missing_postcodes_method = None
-    # landlord_year_built = "Build Date"
-    # landlord_os_uprn = None
-    # landlord_property_type = "Property Type"
-    # landlord_wall_construction = "Wallinsul"
-    # landlord_heating_system = "HeatSorc"
-    # landlord_existing_pv = None
-    # landlord_property_id = "Property Reference"
-
-    # For Westward
-    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
-    data_filename = "WESTWARD - completed list..xlsx"
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
     sheet_name = "Sheet1"
-    postcode_column = "WFT EDIT Postcode"
-    fulladdress_column = "Address"
+    postcode_column = 'Full Address.1'
+    fulladdress_column = "Full Address"
     address1_column = None
-    address1_method = "house_number_extraction"
+    address1_method = "first_word"
     address_cols_to_concat = []
     missing_postcodes_method = None
-    landlord_year_built = "Build date"
-    landlord_os_uprn = "UPRN"
-    landlord_property_type = "Location type"
-    landlord_wall_construction = "Wall Construction (EPC)"
-    landlord_heating_system = "Heat Source"
-    landlord_existing_pv = "PV (Y/N)"
-    landlord_property_id = "Place ref"
+    landlord_year_built = "Build Date"
+    landlord_os_uprn = None
+    landlord_property_type = "Property Type"
+    landlord_wall_construction = "Wallinsul"
+    landlord_heating_system = "HeatSorc"
+    landlord_existing_pv = None
+    landlord_property_id = "Property Reference"
+
+    # For Westward
+    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    # data_filename = "WESTWARD - completed list..xlsx"
+    # sheet_name = "Sheet1"
+    # postcode_column = "WFT EDIT Postcode"
+    # fulladdress_column = "Address"
+    # address1_column = None
+    # address1_method = "house_number_extraction"
+    # address_cols_to_concat = []
+    # missing_postcodes_method = None
+    # landlord_year_built = "Build date"
+    # landlord_os_uprn = "UPRN"
+    # landlord_property_type = "Location type"
+    # landlord_wall_construction = "Wall Construction (EPC)"
+    # landlord_heating_system = "Heat Source"
+    # landlord_existing_pv = "PV (Y/N)"
+    # landlord_property_id = "Place ref"
 
     # Maps addresses to uprn in problematic cases
-    MANUAL_UPRN_MAP = {}
+    manual_uprn_map = {}
 
     asset_list = AssetList(
         local_filepath=os.path.join(data_folder, data_filename),
@@ -352,7 +352,7 @@ def app():
         epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
             df=chunk,
             row_id_name=asset_list.DOMNA_PROPERTY_ID,
-            manual_uprn_map=MANUAL_UPRN_MAP,
+            manual_uprn_map=manual_uprn_map,
         )
 
         # We now retrieve any failed properties
@@ -360,7 +360,7 @@ def app():
         epc_data_failed, _, _ = get_data(
             df=chunk_failed,
             row_id_name=asset_list.DOMNA_PROPERTY_ID,
-            manual_uprn_map=MANUAL_UPRN_MAP,
+            manual_uprn_map=manual_uprn_map,
             epc_api_only=False
         )
 
@@ -464,6 +464,7 @@ def app():
     )
     cleaned = msgpack.unpackb(cleaned, raw=False)
 
+    # TODO: We should break out the identification of work types to flag blocks of flats specifically
     asset_list.identify_worktypes(cleaned)
 
     pprint(asset_list.work_type_figures)
diff --git a/backend/Funding.py b/backend/Funding.py
index f0780c51..2839c7ff 100644
--- a/backend/Funding.py
+++ b/backend/Funding.py
@@ -149,7 +149,8 @@ class Funding:
         :return:
         """
         measure_table = pd.DataFrame([
-            m for m in self.recommendations if m in measures and m["default"]
+            m for m in self.recommendations if  # "default" must gate both membership tests
+            ((m["type"] in measures) or (m["measure_type"] in measures)) and m["default"]
         ])
 
         measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap
@@ -180,13 +181,10 @@ class Funding:
         measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"]
         measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"]
         measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False])
-        # Recommend the measure, with estimated funding amount
-        recommended_measure = measure_table.head(1)
 
-        return {
-            "measure_type": recommended_measure["measure_type"],
-            "estimated_funding": recommended_measure["estimated_funding"]
-        }
+        return measure_table[
+            ["type", "measure_type", "Cost Savings", "estimated_funding"]
+        ].rename(columns={"Cost Savings": "project_score"}).to_dict("records")
 
     def sap_to_eco_band(self, sap_points):
         """
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 76c172ee..d82e774b 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -825,7 +825,7 @@ async def trigger_plan(body: PlanTriggerRequest):
                 property_recommendations=recommendations[p.id],
                 project_scores_matrix=eco_project_scores_matrix,
                 whlg_eligible_postcodes=whlg_eligible_postcodes,
-                gbis_abs_rate=20,
+                gbis_abs_rate=15,
                 eco4_abs_rate=15,
             )
             funding_calulator.check_eligibiltiy()
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index aac0a1a6..fc3b7ec6 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
 from utils.s3 import save_csv_to_s3
 from etl.find_my_epc.AssetListEpcData import AssetListEpcData
 
-PORTFOLIO_ID = 137
+PORTFOLIO_ID = 134
 USER_ID = 8
 
 load_dotenv(dotenv_path="backend/.env")
@@ -19,10 +19,25 @@ def app():
 
     asset_list = [
         {
-            "address": "41 Gainsborough Way",
-            "postcode": "BA21 5XU",
-            "uprn": 30016708,
+            "address": "Flat 2, 42 Malden Road, London NW5 3HG",
+            "postcode": "NW5 3HG",
+            "uprn": 5117165,
         },
+        {
+            "address": "15 Bournville Lane",
+            "postcode": "B30 2JY",
+            "uprn": 100070301128
+        },
+        {
+            "address": "34 Bournville Lane",
+            "postcode": "B30 2LN",
+            "uprn": 100070301140
+        },
+        {
+            "address": "36 Bournville Lane",
+            "postcode": "B30 2LN",
+            "uprn": 100070301142
+        }
     ]
     asset_list = pd.DataFrame(asset_list)
 
@@ -52,9 +67,21 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 30016708,
-            "valuation": 189000
-        }
+            "uprn": 5117165,
+            "valuation": 467_000
+        },
+        {
+            "uprn": 100070301128,
+            "valuation": 335_000
+        },
+        {
+            "uprn": 100070301140,
+            "valuation": 276_000
+        },
+        {
+            "uprn": 100070301142,
+            "valuation": 276_000
+        },
     ]
     # Store valuation data to s3
     valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv"
diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py
index bce8cd1f..1d2e1472 100644
--- a/etl/find_my_epc/AssetListEpcData.py
+++ b/etl/find_my_epc/AssetListEpcData.py
@@ -72,12 +72,20 @@ class AssetListEpcData:
             epc_searcher.find_property(skip_os=True)
             if epc_searcher.newest_epc is None:
                 continue
-
-            find_epc_searcher = RetrieveFindMyEpc(
-                address=epc_searcher.newest_epc["address1"],
-                postcode=epc_searcher.newest_epc["postcode"]
-            )
-            find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+            # Attempt both methods:
+            try:
+                find_epc_searcher = RetrieveFindMyEpc(
+                    address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"],
+                    postcode=epc_searcher.newest_epc["postcode"]
+                )
+                find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+            except Exception as e:
+                logger.error(f"Error retrieving find my epc data: {e}")
+                find_epc_searcher = RetrieveFindMyEpc(
+                    address=epc_searcher.newest_epc["address1"],
+                    postcode=epc_searcher.newest_epc["postcode"]
+                )
+                find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
             time.sleep(0.5)
             # We need uprn
 
diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py
index dd81680a..e4dd3a78 100644
--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@@ -852,6 +852,8 @@ class HeatingRecommender:
         else:
             heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"]
 
+        # TODO: We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual immersion
+        #       we'll keep this for the moment though
         if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]:
             heating_simulation_config["hot_water_energy_eff_ending"] = "Average"
         else: