From 1bbc89002cc448813cf06d9bf6f1facbc7bc25ca Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 23 Dec 2023 13:57:51 +0000
Subject: [PATCH 01/40] building ha7 pipeline

---
 backend/SearchEpc.py                |   4 +-
 etl/eligibility/Eligibility.py      |   3 +-
 etl/eligibility/ha_15_32/ha7_app.py | 155 ++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+), 2 deletions(-)
 create mode 100644 etl/eligibility/ha_15_32/ha7_app.py

diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index d8ea6b78..f1cda010 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -202,7 +202,9 @@ class SearchEpc:
             return {}, []
 
         if len(newest_response) != 1:
-            raise Exception("More than one result found for this address - investigate me")
+            # It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that
+            # were lodged at the exact same time. In this case, we will take the first one
+            newest_response = [newest_response[0]]
 
         older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]]
 
diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index c4dc9de0..7a6fade1 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -331,9 +331,10 @@ class Eligibility:
         is_eligible = self.cavity["suitability"] & self.loft["suitability"]
 
         if post_retrofit_sap is None:
+            message = "subject to post retrofit sap" if is_eligible else "not eligible"
             self.eco4_warmfront = {
                 "eligible": is_eligible,
-                "message": "subject to post retrofit sap"
+                "message": message
             }
             return
 
diff --git a/etl/eligibility/ha_15_32/ha7_app.py b/etl/eligibility/ha_15_32/ha7_app.py
new file mode 100644
index 00000000..139943a1
--- /dev/null
+++ b/etl/eligibility/ha_15_32/ha7_app.py
@@ -0,0 +1,155 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
+# NOTE(review): this file already sits in etl/eligibility/ha_15_32, so ENV_FILE nests that path a second time - confirm
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def load_data():
+    """
+    Load the data from the excel
+    """
+
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 7 ASSET LIST.xlsx')
+    sheet = workbook.active
+
+    # Collect each row's cell values together with the fill colour of its first cell
+    rows_data = []
+    rows_colors = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):  # Row 1 is used as the header row below
+        row_data = [cell.value for cell in row]  # Plain cell values for this row
+        # The colour index of the first cell stands for the whole row; '00000000' becomes None
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        row_color = COLOR_INDEX[row_color]  # NOTE(review): presumably fails when row_color is None - confirm all rows are filled
+        rows_data.append(row_data)
+        rows_colors.append(row_color)
+
+    df = pd.DataFrame(rows_data, columns=[cell.value for cell in sheet[1]])
+
+    # Add the row colors as a new column
+    df['row_color'] = rows_colors
+    df.columns.values[8] = "is_active"  # Overwrites the label of the column at position 8
+
+    # Drop columns in which every value is missing
+    df = df.dropna(axis=1, how='all')
+    # Translate the colour codes into colour names, and the names into a status code
+    df["row_color"].unique()  # NOTE(review): the result is discarded, so this line has no effect
+    df["row_colour_name"] = np.where(
+        df["row_color"] == "0000FFFF", "red",
+        np.where(df["row_color"] == "00FF00FF", "green", "yellow")  # any other colour falls back to "yellow"
+    )
+    df["row_code"] = np.where(
+        df["row_colour_name"] == "red", "invalid",
+        np.where(df["row_colour_name"] == "green", "potential ECO4", "needs criteria change")
+    )
+
+    return df
+
+
+def get_ha7_data(data, cleaned, cleaning_data, created_at):
+    property_type_lookup = {  # asset-list "Property Type" value -> value passed to retrieve()
+        "Mid Terrace": "Mid-Terrace",
+        "End Terrace": "End-Terrace",
+        "Semi Detached": "Semi-Detached",
+        "Detached": "Detached",
+    }
+
+    scoring_data = []  # model input rows, collected only for houses that pass the ECO4 warmfront check
+    results = []  # one summary dict per house that reached the eligibility checks
+    nodata = []  # asset rows skipped because the search returned status 204
+    for _, house in tqdm(data.iterrows(), total=len(data)):
+
+        searcher = SearchEpc(
+            address1=house["Address"],
+            postcode=house["Postcode"]
+        )
+
+        response = searcher.search()
+        if response["status"] == 204:  # treated as "no data": keep the row aside and skip it
+            nodata.append(house)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(
+            property_type=property_type_lookup.get(house["Property Type"], None),  # None when not in the lookup
+            address=house["Address"],
+        )
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        # The full gbis and eco4 checks are run for every house; they feed the "*_future" fields below
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            scoring_dictionary = prepare_model_data_row(
+                property_id=house["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)  # extend: presumably a list of rows is returned - confirm
+
+        # Record the outcome for every house that was checked, whether or not it is eligible
+        results.append(
+            {
+                "row_id": house["row_id"],
+                "address": house["Address"],
+                "postcode": house["Postcode"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+
+def app():
+    data = load_data()
+    data["row_id"] = ["ha7" + str(i) for i in range(0, len(data))]  # synthetic id: "ha7" + row position
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)  # the object is named .bson but is decoded with msgpack
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()

From 64d42aba67fc601317aa22a971b6d6465a244246 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 23 Dec 2023 15:25:47 +0000
Subject: [PATCH 02/40] ha7

---
 etl/eligibility/ha_15_32/ha7_app.py | 132 ++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha7_app.py b/etl/eligibility/ha_15_32/ha7_app.py
index 139943a1..7d856366 100644
--- a/etl/eligibility/ha_15_32/ha7_app.py
+++ b/etl/eligibility/ha_15_32/ha7_app.py
@@ -137,6 +137,131 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at):
             }
         )
 
+    scoring_df = pd.DataFrame(scoring_data)
+    # Apply the same process that the recommendation engine uses to clean scoring_df
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)  # NOTE(review): id says "ha33" in the HA7 pipeline - confirm
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]  # assumes "predictions" is a SAP score - TODO confirm
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()  # total uplift per property
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    eligibility_assessment = []
+    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+        # The upgrade requirements are dependent on the current SAP
+
+        # If the property is an F or G (SAP <= 38), the target is lower: presumably band D (SAP 55) - confirm
+        if row["sap"] <= 38:
+            if row["post_install_sap"] >= 57:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 55:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 53:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+        else:
+            # Otherwise the target is higher: presumably band C (SAP 69) - confirm
+            if row["post_install_sap"] >= 71:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 69:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 67:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+
+        eligibility_assessment.append(
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": eligibility_classification
+            }
+        )
+
+    eligibility_assessment = pd.DataFrame(eligibility_assessment)
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"  # left join: rows that were not assessed get no classification
+    )
+
+    return results_df, scoring_data, nodata
+
+
+def analyse_ha_7(results_df, data):
+    df = results_df.merge(  # attach the asset list's row_code and property type to each result
+        data[["row_id", "row_code", "Property Type"]], how="left", on="row_id"
+    )
+    warmfront_identification = df["row_code"].value_counts()  # number of results per row_code
+    warmfront_identified = df[df["row_code"] == "potential ECO4"]
+
+    property_types = df["Property Type"].value_counts()  # number of results per property type
+
+    n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum()  # eligible for either scheme
+
+    eco_identified = results_df[results_df["eco4_eligible"]]
+    n_eco4 = eco_identified["eco4_eligible"].sum()  # number of ECO4-eligible rows
+    gbis_identified = results_df[~results_df["eco4_eligible"] & results_df["gbis_eligible"]]
+    n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum()  # GBIS-eligible among the non-ECO4 rows
+
+    eco_eligibile = results_df[results_df["eco4_eligible"]]
+    eco_eligibile["eligibility_classification"].value_counts()  # NOTE(review): the result is discarded
+    # "Future" rows: not eligible for either scheme now, but flagged by the *_future columns
+    future_possibilities_eco = results_df[
+        (results_df["eco4_eligible_future"] == True) & (~(results_df["gbis_eligible"] | results_df["eco4_eligible"]))
+        ].copy()
+
+    future_possibilities_gbis = results_df[
+        (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & (
+            ~(results_df["gbis_eligible"] | results_df["eco4_eligible"]))
+        ].copy()
+    # NOTE(review): the function ends after the next line without returning any of the values computed here
+    total_future_possibilities = future_possibilities_eco.shape[0] + future_possibilities_gbis.shape[0]
+
 
 def app():
     data = load_data()
@@ -153,3 +278,10 @@ def app():
     )
 
     created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_ha7_data(data, cleaned, cleaning_data, created_at)
+
+    # Pickle results
+    # import pickle
+    # with open("ha7_results.pkl", "wb") as f:
+    #     pickle.dump({"results_df": results_df, "scoring_data": scoring_data, "nodata": nodata}, f)

From 43004a5d8beac085d6eca1c6a588941abea97173 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 24 Dec 2023 15:48:36 +0000
Subject: [PATCH 03/40] working on merge between asset list and survey list

---
 etl/eligibility/ha_15_32/ha16_app.py | 115 +++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 etl/eligibility/ha_15_32/ha16_app.py

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
new file mode 100644
index 00000000..e347d47c
--- /dev/null
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -0,0 +1,115 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
+# NOTE(review): this file already sits in etl/eligibility/ha_15_32, so ENV_FILE nests that path a second time - confirm
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def load_data():
+    # This asset list is spread across two workbook files, which we need to combine
+
+    asset_list_filenames = [
+        "HESTIA - HA 16 ASSET LIST PART 1 OF 2.xlsx",
+        "HESTIA - HA 16 ASSET LIST PART 2 OF 2.xlsx",
+    ]
+
+    # Collect the rows of both files, plus the fill colour of each row's first cell
+    rows_data = []
+    rows_colors = []
+    colnames = []
+    for asset_list_filename in asset_list_filenames:
+        workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/{asset_list_filename}')
+        sheet = workbook.active
+        sheet_colnames = [cell.value for cell in sheet[1]]
+        colnames.append(sheet_colnames)  # NOTE(review): only colnames[0] is used below - assumes both headers match
+
+        for row in sheet.iter_rows(min_row=2, values_only=False):  # Row 1 is used as the header row
+            row_data = [cell.value for cell in row]  # Plain cell values for this row
+            row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+            # row_color = COLOR_INDEX[row_color]
+            rows_data.append(row_data)
+            rows_colors.append(row_color)
+
+    asset_list = pd.DataFrame(rows_data, columns=colnames[0])
+    # Keep only the first 12 columns
+    asset_list = asset_list.iloc[:, 0:12]
+    asset_list['row_color'] = rows_colors
+
+    asset_list["row_colour_name"] = np.where(
+        asset_list["row_color"] == "FFFF0000", "red",
+        np.where(asset_list["row_color"] == "FF92D050", "green", "yellow")  # any other colour falls back to "yellow"
+    )
+
+    # Split up the address on commas, which is useful for matching later
+    split_addresses = asset_list['Address'].str.split(',', expand=True)
+    split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']  # assumes exactly five parts - confirm
+
+    asset_list = pd.concat([asset_list, split_addresses], axis=1)
+    # No comma separates the house number from the first address line, so split that part on spaces
+    split_addresses2 = asset_list['temp'].str.split(' ', expand=True)
+    split_addresses2.columns = ['HouseNo', 'part1', 'part2', "part3", "part4"]  # assumes exactly five parts - confirm
+    # We could re-concatenate but we only care about HouseNo for the moment
+    asset_list = pd.concat([asset_list, split_addresses2[["HouseNo"]]], axis=1)
+
+    # We now read in the survey list
+    survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA- HA 16 ECO4 SURVEY LIST.xlsx')
+    survey_sheet = survey_workbook.active
+
+    survey_rows = []
+    survey_colors = []
+
+    for row in survey_sheet.iter_rows(min_row=2, values_only=False):  # Row 1 is used as the header row
+        row_data = [cell.value for cell in row]  # Plain cell values for this row
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        survey_rows.append(row_data)
+        survey_colors.append(row_color)
+
+    survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+
+    # For the survey list, we don't need the colours, since there is a column called "INSTALLED OR CANCELLED"
+    # which describes the status of the property; the colours are stored in "row_colour" anyway
+    survey_list["row_colour"] = survey_colors
+    survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
+    # Tidy up the street/block name a bit
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ")
+
+    # We now need to merge the survey list onto the asset list
+    # Could be easier just to do a search on each row, even though it's much slower
+    matched = []
+    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+        # Filter on the first line of the address
+        df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
+        df = df[df["Postcode"].str.contains(row["Post Code"])]
+        df = df[df["Address"].str.contains(str(row["NO."]))]
+        if df.shape[0] != 1:
+            df = df[df["HouseNo"] == str(row["NO."])]
+            if df.shape[0] != 1:
+                raise ValueError("Investigate")
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["Address"].values[0]
+            }
+        )

From 3f7ad82b7aabd4a34e3ea852dc9496992ae6152e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 24 Dec 2023 22:28:03 +0000
Subject: [PATCH 04/40] working on matching

---
 etl/eligibility/ha_15_32/ha16_app.py | 41 ++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index e347d47c..25b33255 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -93,19 +93,54 @@ def load_data():
     survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
     # Tidy up the street/block name a bit
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower()
+    survey_list["Street / Block Name"] = np.where(
+        survey_list["Street / Block Name"] == "REEDS RD",
+        "Reeds ROAD",
+        survey_list["Street / Block Name"]
+    )
+    # Replace " rd " with "road"
+    survey_list['Street / Block Name'] = df['Street / Block Name'].str.replace(r'\brd\b', 'road', regex=True)
+
+    # Replace " , " with ", "
+    survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(
+        " , ", ', ',
+    )
+    # Strip whitespace
+    survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.strip()
+
+    # Correct errors
+    survey_list["Post Code"] = np.where(
+        survey_list["Post Code"] == "M38 0SA",
+        "M38 9SA",
+        survey_list["Post Code"]
+    )
+
+    survey_list["Post Code"] = np.where(
+        (survey_list["Street / Block Name"] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"),
+        "M44 5JF",
+        survey_list["Post Code"]
+    )
 
     # We now need to merge the survey list onto the asset list
     # Could be easier just to do a search on each row, even though it's much slower
     matched = []
     for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+
+        house_number = row["NO."]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
         # Filter on the first line of the address
         df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
-        df = df[df["Postcode"].str.contains(row["Post Code"])]
-        df = df[df["Address"].str.contains(str(row["NO."]))]
+        # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+        df = df[df["Address"].str.lower().str.contains(str(house_number))]
         if df.shape[0] != 1:
             df = df[df["HouseNo"] == str(row["NO."])]
             if df.shape[0] != 1:
-                raise ValueError("Investigate")
+                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                if df.shape[0] != 1:
+                    raise ValueError("Investigate")
 
         matched.append(
             {

From e21057ca6120b4c75e95a04b2c7dbd894e172925 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 24 Dec 2023 23:09:25 +0000
Subject: [PATCH 05/40] still working on merge

---
 etl/eligibility/ha_15_32/ha16_app.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 25b33255..0d5a3361 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -100,12 +100,14 @@ def load_data():
         survey_list["Street / Block Name"]
     )
     # Replace " rd " with "road"
-    survey_list['Street / Block Name'] = df['Street / Block Name'].str.replace(r'\brd\b', 'road', regex=True)
+    survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(r'\brd\b', 'road', regex=True)
 
     # Replace " , " with ", "
     survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(
         " , ", ', ',
     )
+    # Fix "{place} ,{place}" with "{place}, {place}"
+    survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(r'\s*,\s*', ', ', regex=True)
     # Strip whitespace
     survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.strip()
 
@@ -122,11 +124,18 @@ def load_data():
         survey_list["Post Code"]
     )
 
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("eccels", "eccles")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("chatley, road", "chatley road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("vaughen", "Vaughan")
+
     # We now need to merge the survey list onto the asset list
     # Could be easier just to do a search on each row, even though it's much slower
     matched = []
     for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
 
+        if row["Street / Block Name"] in ["carleach grove"]:
+            continue
+
         house_number = row["NO."]
         if isinstance(house_number, str):
             house_number = house_number.lower()

From 85ad1d3fc5a2c49bb88e139ec4252b2845d06e9b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 24 Dec 2023 23:45:25 +0000
Subject: [PATCH 06/40] handling more errors

---
 etl/eligibility/ha_15_32/ha16_app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 0d5a3361..e98f8ddc 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -127,6 +127,7 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("eccels", "eccles")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("chatley, road", "chatley road")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("vaughen", "Vaughan")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cresent", "crescent")
 
     # We now need to merge the survey list onto the asset list
     # Could be easier just to do a search on each row, even though it's much slower

From 78827b4743f2bcff576d3a748cd4367a99d10d72 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 25 Dec 2023 00:28:50 +0000
Subject: [PATCH 07/40] a third through

---
 etl/eligibility/ha_15_32/ha16_app.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index e98f8ddc..80a6b59d 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -128,13 +128,15 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("chatley, road", "chatley road")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("vaughen", "Vaughan")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cresent", "crescent")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("plantation road",
+                                                                                        "plantation avenue")
 
     # We now need to merge the survey list onto the asset list
     # Could be easier just to do a search on each row, even though it's much slower
     matched = []
     for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
 
-        if row["Street / Block Name"] in ["carleach grove"]:
+        if row["Street / Block Name"] in ["carleach grove", "arbour grove"]:
             continue
 
         house_number = row["NO."]

From 4dc827037dbb825e4f85534e2b744f3069a93375 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 14:16:11 +0000
Subject: [PATCH 08/40] handling case for 66-68 clively

---
 etl/eligibility/ha_15_32/ha16_app.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 80a6b59d..bf9e7792 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -130,6 +130,14 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cresent", "crescent")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("plantation road",
                                                                                         "plantation avenue")
+    # Replacement for clively avenue 66-68
+    survey_list["NO."] = np.where(
+        survey_list["NO."] == "66-68",
+        "66",
+        survey_list["NO."]
+    )
+
+    # asset_list[asset_list["Address"].str.lower().str.contains("clively")]
 
     # We now need to merge the survey list onto the asset list
     # Could be easier just to do a search on each row, even though it's much slower

From 94d322b442bcf6db5a37f3c8a0b46684c99d6ecd Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 14:19:15 +0000
Subject: [PATCH 09/40] fixed more spelling issues in survey file

---
 etl/eligibility/ha_15_32/ha16_app.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index bf9e7792..94c0bab0 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -130,6 +130,10 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cresent", "crescent")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("plantation road",
                                                                                         "plantation avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("how clough drive",
+                                                                                        "howclough drive")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brockhurst lane",
+                                                                                        "brookhurst lane")
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(
         survey_list["NO."] == "66-68",

From b47ca2b50563ea8dfeefb1f72d98fae9caa9d2b2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 14:24:57 +0000
Subject: [PATCH 10/40] handling more error cases

---
 etl/eligibility/ha_15_32/ha16_app.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 94c0bab0..4e5212eb 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -134,6 +134,14 @@ def load_data():
                                                                                         "howclough drive")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brockhurst lane",
                                                                                         "brookhurst lane")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("biirch road",
+                                                                                        "birch road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hadson road",
+                                                                                        "hodson road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("harbonne avennue",
+                                                                                        "narbonne avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cumberland road, cadishead",
+                                                                                        "cumberland avenue, cadishead")
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(
         survey_list["NO."] == "66-68",

From cab7b5ec3e101a08d644a70ea39e1c40d05508b6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 14:34:02 +0000
Subject: [PATCH 11/40] further debugging

---
 etl/eligibility/ha_15_32/ha16_app.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 4e5212eb..26cf1a0d 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -142,6 +142,16 @@ def load_data():
                                                                                         "narbonne avenue")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cumberland road, cadishead",
                                                                                         "cumberland avenue, cadishead")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("aston field drive",
+                                                                                        "ashton field drive")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wedgewood road",
+                                                                                        "wedgwood road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hamilton close",
+                                                                                        "hamilton avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("lichens crescent, fitton hill",
+                                                                                        "lichens crescent")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("south croft, fitton hill",
+                                                                                        "south croft")
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(
         survey_list["NO."] == "66-68",
@@ -168,7 +178,7 @@ def load_data():
         # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
         df = df[df["Address"].str.lower().str.contains(str(house_number))]
         if df.shape[0] != 1:
-            df = df[df["HouseNo"] == str(row["NO."])]
+            df = df[df["HouseNo"] == str(house_number)]
             if df.shape[0] != 1:
                 df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
                 if df.shape[0] != 1:

From fd4b9baefb055f5b612d016542b29f6a2500dd7d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 19:33:43 +0000
Subject: [PATCH 12/40] cleaning

---
 etl/eligibility/ha_15_32/ha16_app.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 26cf1a0d..a16fc37d 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -152,6 +152,13 @@ def load_data():
                                                                                         "lichens crescent")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("south croft, fitton hill",
                                                                                         "south croft")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(", fitton hill", "")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("firtree dr", "fir tree avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hawthorne road",
+                                                                                        "hawthorn crescent")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("rein lee avenue",
+                                                                                        "reins lee avenue")
+
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(
         survey_list["NO."] == "66-68",

From a45ac1e14f834670211e0f1de69a3ea5a7a46c7f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 19:45:47 +0000
Subject: [PATCH 13/40] multiple further corrections

---
 etl/eligibility/ha_15_32/ha16_app.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index a16fc37d..76c4ad14 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -158,6 +158,25 @@ def load_data():
                                                                                         "hawthorn crescent")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("rein lee avenue",
                                                                                         "reins lee avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("westerhill road",
+                                                                                        "wester hill road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("st martins road",
+                                                                                        "saint martins road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("timperley avenue",
+                                                                                        "timperley close")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("eastwood road",
+                                                                                        "eastwood avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("new road", "new street")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("grassmere road",
+                                                                                        "grasmere road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hulton road",
+                                                                                        "hulton avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("beechfield avenue",
+                                                                                        "beechfield road")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("princess avenue",
+                                                                                        "princes avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("edge ford crecent",
+                                                                                        "edge fold crecent")
 
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(

From 95dd23e925c6d8f42bbdddf542ee92aaaafaca71 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 19:52:02 +0000
Subject: [PATCH 14/40] multiple further corrections

---
 etl/eligibility/ha_15_32/ha16_app.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 76c4ad14..553a86da 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -176,7 +176,13 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("princess avenue",
                                                                                         "princes avenue")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("edge ford crecent",
-                                                                                        "edge fold crecent")
+                                                                                        "edge fold crescent")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("conniston avenue",
+                                                                                        "coniston avenue")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("blackthorne crescent",
+                                                                                        "blackthorn crescent")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wellstock road",
+                                                                                        "aellstock lane")
 
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(

From 8f38996391c875d0ccbd187b42cddb93705628f1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 21:23:00 +0000
Subject: [PATCH 15/40] completed matching

---
 etl/eligibility/ha_15_32/ha16_app.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 553a86da..bc8fbfbd 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -182,7 +182,17 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("blackthorne crescent",
                                                                                         "blackthorn crescent")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wellstock road",
-                                                                                        "aellstock lane")
+                                                                                        "wellstock lane")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brackley avenue",
+                                                                                        "brackley street")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brook avenue swinton",
+                                                                                        "brook avenue, swinton")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("green avenue swinton",
+                                                                                        "green avenue, swinton")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("grasmere avenue wardley",
+                                                                                        "grasmere avenue, wardley")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("mardale avenue wardle",
+                                                                                        "mardale avenue, wardle")
 
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(

From e6a25ab7beea79e33cbaa5478c7ba675a8179feb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 26 Dec 2023 21:32:14 +0000
Subject: [PATCH 16/40] fixed missed cases and completed merge between assets
 and surveys

---
 etl/eligibility/ha_15_32/ha16_app.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index bc8fbfbd..402527df 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -193,6 +193,10 @@ def load_data():
                                                                                         "grasmere avenue, wardley")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("mardale avenue wardle",
                                                                                         "mardale avenue, wardle")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("carleach grove",
+                                                                                        "cartleach Grove")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("arbour grove",
+                                                                                        "arbor Grove")
 
     # Replacement for clively avenue 66-68
     survey_list["NO."] = np.where(
@@ -208,9 +212,6 @@ def load_data():
     matched = []
     for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
 
-        if row["Street / Block Name"] in ["carleach grove", "arbour grove"]:
-            continue
-
         house_number = row["NO."]
         if isinstance(house_number, str):
             house_number = house_number.lower()
@@ -229,6 +230,21 @@ def load_data():
         matched.append(
             {
                 "survey_key": row["survey_key"],
-                "matched_address": df["Address"].values[0]
+                "matched_address": df["Address"].values[0],
+                "survey_house_no": row["NO."],
+                "survey_street_name": row["Street / Block Name"],
+                "survey_postcode": row["Post Code"],
+                "survey_status": row["INSTALLED OR CANCELLED"]
             }
         )
+
+    matched = pd.DataFrame(matched)
+    matched["warmfront_identified"] = True
+
+    # Combine asset list and surveys
+    data = asset_list.merge(
+        matched, how="left", left_on="Address", right_on="matched_address",
+    )
+    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
+
+    return data

From a948688df6c4e220be03688700834fbb78701ccd Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 10:28:06 +0000
Subject: [PATCH 17/40] set up get_epc_data function

---
 etl/eligibility/ha_15_32/ha16_app.py | 98 ++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 402527df..f930452e 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -248,3 +248,101 @@ def load_data():
     data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
 
     return data
+
+
+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["Postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        # We also want to get the penultimate epc
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+            # If this is the case, we need to update the older epcs
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full checks
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            scoring_dictionary = prepare_model_data_row(
+                property_id=eligibility.epc["uprn"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["Address"],
+                "Postcode": property_meta["Postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+
+def app():
+    data = load_data()
+
+    data["row_id"] = ["ha16" + str(i) for i in range(0, len(data))]
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()

From 29f8fecdd0fde658beb7a0f04fb7f2bde541291d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 10:49:24 +0000
Subject: [PATCH 18/40] debugging issue with missing uprn

---
 etl/eligibility/ha_15_32/ha16_app.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index f930452e..a1d25c53 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -293,8 +293,11 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
         eligibility.check_eco4()
 
         if eligibility.eco4_warmfront["eligible"]:
+            if eligibility.epc["uprn"] == "":
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
             scoring_dictionary = prepare_model_data_row(
-                property_id=eligibility.epc["uprn"],
+                property_id=property_meta["row_id"],
                 modelling_epc=eligibility.epc,
                 cleaned=cleaned,
                 cleaning_data=cleaning_data,
@@ -333,7 +336,7 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
 def app():
     data = load_data()
 
-    data["row_id"] = ["ha16" + str(i) for i in range(0, len(data))]
+    data["row_id"] = ["ha16_" + str(i) for i in range(0, len(data))]
 
     cleaned = read_from_s3(
         s3_file_name="cleaned_epc_data/cleaned.bson",

From a7c95107471667ad3ad9bbe44f7616f1c9e931d6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 10:59:01 +0000
Subject: [PATCH 19/40] Creating load data function for ha24

---
 etl/eligibility/ha_15_32/ha24_app.py | 55 ++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 etl/eligibility/ha_15_32/ha24_app.py

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
new file mode 100644
index 00000000..ab639003
--- /dev/null
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -0,0 +1,55 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def load_data():
+    workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 24 ASSET LIST.xlsx')
+    sheet = workbook.active
+    sheet_colnames = [cell.value for cell in sheet[1]]
+
+    rows_data = []
+    rows_colors = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        rows_data.append(row_data)
+        rows_colors.append(row_color)
+
+    asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+    # Remove None columns
+    asset_list = asset_list.iloc[:, 0:10]
+    asset_list['row_color'] = rows_colors
+
+    asset_list["row_colour_name"] = np.where(
+        asset_list["row_color"] == "FFFF0000", "red",
+        np.where(asset_list["row_color"] == "FF92D050", "green", "yellow")
+    )
+
+    asset_list["row_colour_code"] = np.where(
+        asset_list["row_colour_name"] == "red", "does not meet criteria",
+        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
+    )

From 66ad7c3a147c1d2573de478194bed9fb50197286 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 11:05:12 +0000
Subject: [PATCH 20/40] debugging ha7 eligibility

---
 etl/eligibility/Eligibility.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 7a6fade1..364be3cc 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -235,6 +235,14 @@ class Eligibility:
         }
 
     def suspended_floor_insulation(self):
+
+        if "no_data" in self.floor.keys():
+            if self.floor["no_data"]:
+                self.suspended_floor = {
+                    "suitability": False,
+                }
+                return
+
         is_suspended = self.floor["is_suspended"]
         is_insulated = self.floor["insulation_thickness"] in ["average", "above average"]
 
@@ -244,6 +252,14 @@ class Eligibility:
         return
 
     def solid_floor_insulation(self):
+
+        if "no_data" in self.floor.keys():
+            if self.floor["no_data"]:
+                self.solid_floor = {
+                    "suitability": False,
+                }
+                return
+
         is_solid = self.floor["is_solid"]
         is_insulated = self.floor["insulation_thickness"] in ["average", "above average"]
 

From 88c245750d87b681afe4757ba820ae655f4a8b72 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 12:17:38 +0000
Subject: [PATCH 21/40] Added analyse_results for ha16

---
 etl/eligibility/ha_15_32/ha16_app.py | 152 ++++++++++++++++++++++++++-
 etl/eligibility/ha_15_32/ha24_app.py |  18 ++++
 2 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index a1d25c53..678bf76f 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -247,7 +247,7 @@ def load_data():
     )
     data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
 
-    return data
+    return data, survey_list
 
 
 def get_epc_data(data, cleaned, cleaning_data, created_at):
@@ -332,9 +332,144 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
             }
         )
 
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    eligibility_assessment = []
+    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+        # The upgrade requirements are dependent on the current SAP
+
+        # If the property is an F or G, it only needs to upgrade to an %
+        if row["sap"] <= 38:
+            if row["post_install_sap"] >= 57:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 55:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 53:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+        else:
+
+            if row["post_install_sap"] >= 71:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 69:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 67:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+
+        eligibility_assessment.append(
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": eligibility_classification
+            }
+        )
+
+    eligibility_assessment = pd.DataFrame(eligibility_assessment)
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    return results_df, scoring_data, nodata
+
+
+def analyse_results(results_df, data, survey_list):
+    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
+        results_df, how="left", on="row_id"
+    ).merge(
+        survey_list[["survey_key", survey_list.columns[0]]].rename(columns={survey_list.columns[0]: "funding_scheme"}),
+        how="left", on="survey_key"
+    )
+
+    warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
+
+    # Of the ECO jobs, what proportion to we get right
+    warmfront_identified_eco = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"])
+    ]
+
+    eco_success_rate = warmfront_identified_eco["eco4_eligible"].sum() / warmfront_identified_eco.shape[0]
+
+    warmfront_identified_gbis = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 GBIS (ECO+)"])
+    ]
+
+    gbis_success_rate = warmfront_identified_gbis["gbis_eligible"].sum() / warmfront_identified_gbis.shape[0]
+
+    # Additional identified
+    additional_identified_eco = analysis_data[
+        (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False)
+        ].shape[0]
+    additional_identified_gbis = analysis_data[
+        (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & (
+            analysis_data["warmfront_identified"] == False
+        )
+        ].shape[0]
+    # Future
+    additional_identified_eco_future = analysis_data[
+        (analysis_data["eco4_eligible_future"] == True) & (analysis_data["warmfront_identified"] == False)
+        ].shape[0]
+    additional_identified_gbis_future = analysis_data[
+        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False) & (
+            analysis_data["warmfront_identified"] == False
+        )
+        ].shape[0]
+
 
 def app():
-    data = load_data()
+    data, survey_list = load_data()
 
     data["row_id"] = ["ha16_" + str(i) for i in range(0, len(data))]
 
@@ -349,3 +484,16 @@ def app():
     )
 
     created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_epc_data(data, cleaned, cleaning_data, created_at)
+
+    # Store
+    # import pickle
+    # with open("ha16.pickle", "wb") as f:
+    #     pickle.dump(
+    #         {
+    #             "scoring_data": scoring_data,
+    #             "results": results_df,
+    #             "nodata": nodata
+    #         }, f
+    #     )
diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index ab639003..b8d114b6 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -53,3 +53,21 @@ def load_data():
         asset_list["row_colour_name"] == "red", "does not meet criteria",
         np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
     )
+
+    # Read in surveys
+    survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx')
+    survey_sheet = survey_workbook.active
+
+    survey_rows = []
+    survey_colors = []
+
+    for row in survey_sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        survey_rows.append(row_data)
+        survey_colors.append(row_color)
+
+    survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+    # Drop all None rows
+    survey_list = survey_list.dropna(how='all')

From c7972fc88d5f6b6dbb30a3cff47f7481d36d15c8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 13:14:52 +0000
Subject: [PATCH 22/40] finishing ha16

---
 etl/eligibility/ha_15_32/ha16_app.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha16_app.py b/etl/eligibility/ha_15_32/ha16_app.py
index 678bf76f..7c1db158 100644
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@@ -451,7 +451,10 @@ def analyse_results(results_df, data, survey_list):
     # Additional identified
     additional_identified_eco = analysis_data[
         (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False)
-        ].shape[0]
+        ]
+
+    additional_identified_eco["eligibility_classification"].value_counts()
+
     additional_identified_gbis = analysis_data[
         (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & (
             analysis_data["warmfront_identified"] == False

From 975a9fa9a016540a3648aab03423dcd7c7169ef0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 13:29:29 +0000
Subject: [PATCH 23/40] setting up ha24 matching

---
 etl/eligibility/ha_15_32/ha24_app.py | 51 ++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index b8d114b6..44cc27e2 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -54,6 +54,21 @@ def load_data():
         np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
     )
 
+    # The third column is listed as "Address" but it's actually the postcode. We have two Address columns so we
+    # change just the third
+    asset_list.columns.values[2] = "Postcode"
+
+    # Split up the address on commas, which is useful for matching later
+    split_addresses = asset_list['Address'].str.split(',', expand=True)
+    split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5', 'address6']
+
+    asset_list = pd.concat([asset_list, split_addresses], axis=1)
+    # There are no commas separating the house number and address 1
+    split_addresses2 = asset_list['temp'].str.split(' ', expand=True)
+    split_addresses2.columns = ['HouseNo', 'part1', 'part2', "part3", "part4"]
+    # We could re-concatenate but we only care about HouseNo for the moment
+    asset_list = pd.concat([asset_list, split_addresses2[["HouseNo"]]], axis=1)
+
     # Read in surveys
     survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx')
     survey_sheet = survey_workbook.active
@@ -69,5 +84,41 @@ def load_data():
         survey_colors.append(row_color)
 
     survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+
+    survey_list["row_colour"] = survey_colors
+    survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
+    # Tidy up the street/block name a bit
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ")
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower()
+
     # Drop all None rows
     survey_list = survey_list.dropna(how='all')
+    survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
+
+    matched = []
+    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+        house_number = row["NO."]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
+        # Filter on the first line of the address
+        df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
+        # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+        if df.shape[0] != 1:
+            df = df[df["HouseNo"] == str(house_number)]
+            if df.shape[0] != 1:
+                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                if df.shape[0] != 1:
+                    raise ValueError("Investigate")
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["Address"].values[0],
+                "survey_house_no": row["NO."],
+                "survey_street_name": row["Street / Block Name"],
+                "survey_postcode": row["Post Code"],
+                "survey_status": row["INSTALLED OR CANCELLED"]
+            }
+        )

From 25684622f96b36a56f0736210e530865b564c90d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 13:32:42 +0000
Subject: [PATCH 24/40] working through matching ha24

---
 etl/eligibility/ha_15_32/ha24_app.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index 44cc27e2..3b4db9ca 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -90,6 +90,11 @@ def load_data():
     # Tidy up the street/block name a bit
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ")
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower()
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.strip()
+
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "Council House/Nidds Lane", "NIDDS LANE"
+    )
 
     # Drop all None rows
     survey_list = survey_list.dropna(how='all')

From 02dc1241fbf26ddabd47d56e366dd17152341ac8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 15:03:36 +0000
Subject: [PATCH 25/40] completed matched for ha24

---
 etl/eligibility/ha_15_32/ha24_app.py | 34 ++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index 3b4db9ca..a85ff5cf 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -93,11 +93,38 @@ def load_data():
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.strip()
 
     survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
-        "Council House/Nidds Lane", "NIDDS LANE"
+        "council house, nidds lane", "nidds lane"
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "wirral avenue", "wirrall avenue"
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "st ives road", "st. ives crescent"
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "sundringham road", "sandringham road"
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "milton avenue", "milton road"
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "st ives crescent", "st. ives crescent"
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "council house, waterbelly lane", "waterbelly lane"
+    )
+    # Generally remove "council house, " from the start of the street name
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "council house, ", ""
+    )
+    survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+        "st. leodegars close", "st leodegars close"
     )
 
+    # asset_list[asset_list["Address"].str.lower().str.contains("wirral")]["Address"]
+
     # Drop all None rows
-    survey_list = survey_list.dropna(how='all')
+    survey_list = survey_list[~pd.isnull(survey_list["Street / Block Name"])]
     survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
 
     matched = []
@@ -115,6 +142,9 @@ def load_data():
             if df.shape[0] != 1:
                 df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
                 if df.shape[0] != 1:
+                    print(row["Street / Block Name"])
+                    print(house_number)
+                    print(row["Post Code"].lower())
                     raise ValueError("Investigate")
 
         matched.append(

From 8a27daf71b004358b8953edc40a8eb8a2c690d82 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 15:04:46 +0000
Subject: [PATCH 26/40] completed load_data, setup app and get_epc_data

---
 etl/eligibility/ha_15_32/ha24_app.py | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index a85ff5cf..fd362930 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -157,3 +157,38 @@ def load_data():
                 "survey_status": row["INSTALLED OR CANCELLED"]
             }
         )
+
+    matched = pd.DataFrame(matched)
+    matched["warmfront_identified"] = True
+
+    # Combine asset list and surveys
+    data = asset_list.merge(
+        matched, how="left", left_on="Address", right_on="matched_address",
+    )
+    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
+
+    return data, survey_list
+
+
+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    pass
+
+
+def app():
+    data, survey_list = load_data()
+
+    data["row_id"] = ["ha16_" + str(i) for i in range(0, len(data))]
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_epc_data(data, cleaned, cleaning_data, created_at)

From 7878983dbea6e9bbf9d706621df73e451d638732 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 15:05:47 +0000
Subject: [PATCH 27/40] put in get_epc_data

---
 etl/eligibility/ha_15_32/ha24_app.py | 173 ++++++++++++++++++++++++++-
 1 file changed, 172 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index fd362930..b53f01f4 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -171,7 +171,178 @@ def load_data():
 
 
 def get_epc_data(data, cleaned, cleaning_data, created_at):
-    pass
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["Postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        # We also want to get the penultimate epc
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+            # If this is the case, we need to update the older epcs
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full checks
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            if eligibility.epc["uprn"] == "":
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
+            scoring_dictionary = prepare_model_data_row(
+                property_id=property_meta["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["Address"],
+                "Postcode": property_meta["Postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    eligibility_assessment = []
+    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+        # The upgrade requirements are dependent on the current SAP
+
+        # If the property is an F or G, it only needs to upgrade to a D (SAP 55)
+        if row["sap"] <= 38:
+            if row["post_install_sap"] >= 57:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 55:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 53:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+        else:
+
+            if row["post_install_sap"] >= 71:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 69:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 67:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+
+        eligibility_assessment.append(
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": eligibility_classification
+            }
+        )
+
+    eligibility_assessment = pd.DataFrame(eligibility_assessment)
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    return results_df, scoring_data, nodata
 
 
 def app():

From f68256ee12f0976e4750f02553a57811b6707abf Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 15:37:16 +0000
Subject: [PATCH 28/40] Completed HA24

---
 etl/eligibility/ha_15_32/ha24_app.py | 58 +++++++++++++++++++++
 etl/eligibility/ha_15_32/ha25_app.py | 76 ++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 etl/eligibility/ha_15_32/ha25_app.py

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index b53f01f4..dca2b60f 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -345,6 +345,53 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
     return results_df, scoring_data, nodata
 
 
+def analyse_results(results_df, data, survey_list):
+    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
+        results_df, how="left", on="row_id"
+    ).merge(
+        survey_list[["survey_key", survey_list.columns[0]]].rename(columns={survey_list.columns[0]: "funding_scheme"}),
+        how="left", on="survey_key"
+    )
+
+    warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
+
+    # Of the ECO jobs, what proportion do we get right
+    warmfront_identified_eco = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"])
+    ]
+
+    eco_success_rate = warmfront_identified_eco["eco4_eligible"].sum() / warmfront_identified_eco.shape[0]
+
+    warmfront_identified_gbis = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 GBIS (ECO+)"])
+    ]
+
+    # No gbis for this
+    # gbis_success_rate = warmfront_identified_gbis["gbis_eligible"].sum() / warmfront_identified_gbis.shape[0]
+
+    # Additional identified
+    additional_identified_eco = analysis_data[
+        (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False)
+        ]
+
+    additional_identified_eco["eligibility_classification"].value_counts()
+
+    additional_identified_gbis = analysis_data[
+        (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & (
+            analysis_data["warmfront_identified"] == False
+        )
+        ].shape[0]
+    # Future
+    additional_identified_eco_future = analysis_data[
+        (analysis_data["eco4_eligible_future"] == True) & (analysis_data["warmfront_identified"] == False)
+        ].shape[0]
+    additional_identified_gbis_future = analysis_data[
+        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False) & (
+            analysis_data["warmfront_identified"] == False
+        )
+        ].shape[0]
+
+
 def app():
     data, survey_list = load_data()
 
@@ -363,3 +410,14 @@ def app():
     created_at = datetime.now().isoformat()
 
     results_df, scoring_data, nodata = get_epc_data(data, cleaned, cleaning_data, created_at)
+
+    # Pickle results just in case
+    # import pickle
+    # with open("ha24.pickle", "wb") as f:
+    #     pickle.dump(
+    #         {
+    #             "scoring_data": scoring_data,
+    #             "results": results_df,
+    #             "nodata": nodata
+    #         }, f
+    #     )
diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
new file mode 100644
index 00000000..473ba9de
--- /dev/null
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -0,0 +1,76 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def load_data():
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx')
+    sheet = workbook.active
+    sheet_colnames = [cell.value for cell in sheet[1]]
+
+    rows_data = []
+    rows_colors = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        rows_data.append(row_data)
+        rows_colors.append(row_color)
+
+    asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+    asset_list['row_color'] = rows_colors
+
+    asset_list["row_colour_name"] = np.where(
+        asset_list["row_color"] == "FFFF0000", "red",
+        np.where(asset_list["row_color"] == "FF00B050", "green", "yellow")
+    )
+
+    asset_list["row_colour_code"] = np.where(
+        asset_list["row_colour_name"] == "red", "does not meet criteria",
+        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
+    )
+
+    # We analysis historical ECO3 survey list
+    eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
+    dir(eco3_survey_workbook)
+    eco3_survey_sheet = eco3_survey_workbook.active
+
+    eco3_survey_rows = []
+    eco3_survey_colors = []
+
+    for row in eco3_survey_sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        eco3_survey_rows.append(row_data)
+        eco3_survey_colors.append(row_color)
+
+    # Some adhoc analysis on the eco3 survey list, just to get completion and cancellation rates historically
+    eco3_survey_list = pd.DataFrame(eco3_survey_rows, columns=[cell.value for cell in eco3_survey_sheet[1]])
+    eco3_survey_list["row_colour"] = eco3_survey_colors
+    # Remove rows where street name is missing
+    eco3_survey_list = eco3_survey_list[~pd.isnull(eco3_survey_list["Street / Block Name"])]
+
+    eco3_survey_list["INSTALLED OR CANCELLED"]

From d51e1c913d61368b2d256121f9af3c6917fd2ef7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Dec 2023 16:03:31 +0000
Subject: [PATCH 29/40] working on loading data for ha25

---
 etl/eligibility/ha_15_32/ha25_app.py | 113 +++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 5 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 473ba9de..244bb5fd 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -28,11 +28,24 @@ load_dotenv(ENV_FILE)
 def load_data():
     workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx')
     sheet = workbook.active
-    sheet_colnames = [cell.value for cell in sheet[1]]
+    # There are no colnames so we create them ourselves
+    sheet_colnames = [
+        "property_reference",
+        "address",
+        "tenure",
+        "property_type",
+        "unknown1",
+        "year_built",
+        "unknown2",
+        "heating_type",
+        "wall_type",
+        "roof_type",
+        "postcode"
+    ]
 
     rows_data = []
     rows_colors = []
-    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+    for row in sheet.iter_rows(min_row=1, values_only=False):  # Assuming the first row is headers
         row_data = [cell.value for cell in row]  # This will get you the cell values
         row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
         # row_color = COLOR_INDEX[row_color]
@@ -54,8 +67,7 @@ def load_data():
 
     # We analysis historical ECO3 survey list
     eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
-    dir(eco3_survey_workbook)
-    eco3_survey_sheet = eco3_survey_workbook.active
+    eco3_survey_sheet = eco3_survey_workbook["CAVITY"]
 
     eco3_survey_rows = []
     eco3_survey_colors = []
@@ -72,5 +84,96 @@ def load_data():
     eco3_survey_list["row_colour"] = eco3_survey_colors
     # Remove rows where street name is missing
     eco3_survey_list = eco3_survey_list[~pd.isnull(eco3_survey_list["Street / Block Name"])]
+    # We need to parse the row colours
+    # We have the following mappings:
+    # FF7030A0: purple
+    # FF92D050: green
+    # FFFF0000: red
+    # FFFFFF00: yellow
+    # FF38FD23: green
+    eco3_survey_list["row_colour_name"] = np.where(
+        eco3_survey_list["row_colour"] == "FF7030A0", "purple",
+        np.where(eco3_survey_list["row_colour"] == "FF92D050", "green",
+                 np.where(eco3_survey_list["row_colour"] == "FFFF0000", "red",
+                          np.where(eco3_survey_list["row_colour"] == "FFFFFF00", "yellow",
+                                   np.where(eco3_survey_list["row_colour"] == "FF38FD23", "green", "unknown")
+                                   )
+                          )
+                 )
+    )
 
-    eco3_survey_list["INSTALLED OR CANCELLED"]
+    # We map the meaning:
+    # red: cancelled
+    # green: installed advised install complete
+    # purple: installer advised install complete + post works EPC
+    # yellow: filler row - drop
+    eco3_survey_list["row_colour_code"] = np.where(
+        eco3_survey_list["row_colour_name"] == "red", "cancelled",
+        np.where(eco3_survey_list["row_colour_name"] == "green", "installed advised install complete",
+                 np.where(eco3_survey_list["row_colour_name"] == "purple",
+                          "installer advised install complete + post works EPC",
+                          np.where(eco3_survey_list["row_colour_name"] == "yellow", "filler row - drop", "unknown")
+                          )
+                 )
+    )
+
+    # This is good enough for the indicative cancellation rates
+
+    # We now read in the indicative survey list which identified prospects for ECO4 works
+    eco4_survey_workbook = openpyxl.load_workbook(
+        f'etl/eligibility/ha_15_32/HESTIA - HA 25 ADHOC ISOLATED IDENTIFIED PROPERTIES FOR CWI.xlsx'
+    )
+    eco4_prospect_survey_sheet = eco4_survey_workbook["LiveWest"]
+
+    eco4_prospects_survey_rows = []
+    eco4_prospects_survey_colors = []
+
+    for row in eco4_prospect_survey_sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        eco4_prospects_survey_rows.append(row_data)
+        eco4_prospects_survey_colors.append(row_color)
+
+    # Build the ECO4 prospects survey list, using the sheet's first row as column names
+    eco4_prospects_survey_list = pd.DataFrame(
+        eco4_prospects_survey_rows, columns=[cell.value for cell in eco4_prospect_survey_sheet[1]]
+    )
+    eco4_prospects_survey_list["row_colour"] = eco4_prospects_survey_colors
+
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.lower()
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.strip()
+
+    eco4_prospects_survey_list = eco4_prospects_survey_list[~pd.isnull(eco4_prospects_survey_list["ADDRESS 1"])]
+    eco4_prospects_survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(eco4_prospects_survey_list))]
+
+    matched = []
+    for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
+        house_number = row["NO"]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
+        # Filter on the first line of the address
+        df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
+        # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+        if df.shape[0] != 1:
+            df = df[df["HouseNo"] == str(house_number)]
+            if df.shape[0] != 1:
+                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                if df.shape[0] != 1:
+                    print(row["Street / Block Name"])
+                    print(house_number)
+                    print(row["Post Code"].lower())
+                    raise ValueError("Investigate")
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["Address"].values[0],
+                "survey_house_no": row["NO."],
+                "survey_street_name": row["Street / Block Name"],
+                "survey_postcode": row["Post Code"],
+                "survey_status": row["INSTALLED OR CANCELLED"]
+            }
+        )

From 0e5c343319a658499539509e06139990cacc2a9e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 10:51:53 +0000
Subject: [PATCH 30/40] formatting asset list

---
 etl/eligibility/ha_15_32/ha25_app.py | 58 ++++++++++++++++------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 244bb5fd..88502e69 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -26,33 +26,31 @@ load_dotenv(ENV_FILE)
 
 
 def load_data():
-    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx')
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True)
     sheet = workbook.active
-    # There are no colnames so we create them ourselves
-    sheet_colnames = [
-        "property_reference",
-        "address",
-        "tenure",
-        "property_type",
-        "unknown1",
-        "year_built",
-        "unknown2",
-        "heating_type",
-        "wall_type",
-        "roof_type",
-        "postcode"
-    ]
 
     rows_data = []
     rows_colors = []
-    for row in sheet.iter_rows(min_row=1, values_only=False):  # Assuming the first row is headers
-        row_data = [cell.value for cell in row]  # This will get you the cell values
-        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
-        # row_color = COLOR_INDEX[row_color]
+    for row in sheet.iter_rows(min_row=1, values_only=True):  # use values_only=True to get values
+
+        row_data = list(row)  # No need for comprehension, values_only=True returns a tuple of values
         rows_data.append(row_data)
+
+    # Headers are on the final row. Pop them off and store them and then remove them from rows_data
+    headers = rows_data.pop()
+    # The postcode header is None, so we replace it with "postcode"
+    headers[-1] = "postcode"
+
+    # Handle colours separately
+    for row in sheet.iter_rows(min_row=1, values_only=False):
+        # Assume first cell color is indicative of entire row
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
         rows_colors.append(row_color)
 
-    asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+    # Remove the final row of colours, which is the header
+    rows_colors.pop()
+
+    asset_list = pd.DataFrame(rows_data, columns=headers)
     asset_list['row_color'] = rows_colors
 
     asset_list["row_colour_name"] = np.where(
@@ -65,6 +63,19 @@ def load_data():
         np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
     )
 
+    asset_list["address"] = asset_list["T1_Address"].copy().str.lower()
+    asset_list["address"] = asset_list["address"].str.replace("flat", "")
+    asset_list["address"] = asset_list["address"].str.strip()
+
+    split_addresses = asset_list['address'].str.split(' ', expand=True)
+    split_addresses.columns = ['HouseNo', 'address2', 'address3', 'address4', 'address5', 'address6', 'address7',
+                               'address8',
+                               'address9', 'address10', 'address11', 'address12', 'address13']
+    split_addresses["HouseNo"] = split_addresses["HouseNo"].str.replace(";", "")
+
+    # We could re-concatenate but we only care about HouseNo for the moment
+    asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1)
+
     # We analysis historical ECO3 survey list
     eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
     eco3_survey_sheet = eco3_survey_workbook["CAVITY"]
@@ -154,13 +165,12 @@ def load_data():
             house_number = house_number.lower()
 
         # Filter on the first line of the address
-        df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
-        # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
-        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+        df = asset_list[asset_list["T1_Address"].str.lower().str.contains(row["ADDRESS 1"].lower())].copy()
+        df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
         if df.shape[0] != 1:
             df = df[df["HouseNo"] == str(house_number)]
             if df.shape[0] != 1:
-                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
                 if df.shape[0] != 1:
                     print(row["Street / Block Name"])
                     print(house_number)

From e49101767a742fdd48ce392476ef66afa7f78662 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 10:56:24 +0000
Subject: [PATCH 31/40] working on matching

---
 etl/eligibility/ha_15_32/ha25_app.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 88502e69..1ad650bc 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -70,7 +70,7 @@ def load_data():
     split_addresses = asset_list['address'].str.split(' ', expand=True)
     split_addresses.columns = ['HouseNo', 'address2', 'address3', 'address4', 'address5', 'address6', 'address7',
                                'address8',
-                               'address9', 'address10', 'address11', 'address12', 'address13']
+                               'address9', 'address10', 'address11', 'address12', 'address13', 'address14', ]
     split_addresses["HouseNo"] = split_addresses["HouseNo"].str.replace(";", "")
 
     # We could re-concatenate but we only care about HouseNo for the moment
@@ -164,6 +164,9 @@ def load_data():
         if isinstance(house_number, str):
             house_number = house_number.lower()
 
+            if "flat" in house_number:
+                house_number = house_number.split("flat")[1].strip()
+
         # Filter on the first line of the address
         df = asset_list[asset_list["T1_Address"].str.lower().str.contains(row["ADDRESS 1"].lower())].copy()
         df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
@@ -172,18 +175,17 @@ def load_data():
             if df.shape[0] != 1:
                 df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
                 if df.shape[0] != 1:
-                    print(row["Street / Block Name"])
+                    print(row["ADDRESS 1"])
                     print(house_number)
-                    print(row["Post Code"].lower())
+                    print(row["POSTCODE"].lower())
                     raise ValueError("Investigate")
 
         matched.append(
             {
                 "survey_key": row["survey_key"],
-                "matched_address": df["Address"].values[0],
-                "survey_house_no": row["NO."],
-                "survey_street_name": row["Street / Block Name"],
-                "survey_postcode": row["Post Code"],
-                "survey_status": row["INSTALLED OR CANCELLED"]
+                "matched_address": df["T1_Address"].values[0],
+                "survey_house_no": row["NO"],
+                "survey_street_name": row["ADDRESS 1"],
+                "survey_postcode": row["POSTCODE"],
             }
         )

From 4dbd76a4053b219fa5582ceb8e078b444babc325 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 11:13:20 +0000
Subject: [PATCH 32/40] debugging matching

---
 etl/eligibility/ha_15_32/ha25_app.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 1ad650bc..52d11a27 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -158,8 +158,29 @@ def load_data():
     eco4_prospects_survey_list = eco4_prospects_survey_list[~pd.isnull(eco4_prospects_survey_list["ADDRESS 1"])]
     eco4_prospects_survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(eco4_prospects_survey_list))]
 
+    # Correct some errors in the survey list
+    eco4_prospects_survey_list["POSTCODE"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "berry park") &
+        (eco4_prospects_survey_list["POSTCODE"] == "PL12 6HP"),
+        "PL12 6EN",
+        eco4_prospects_survey_list["POSTCODE"]
+    )
+
+    # Remove semi colons from address in asset and survey list
+    asset_list["T1_Address"] = asset_list["T1_Address"].str.replace(";", "")
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(";", "")
+
+    # In the prospects survey list, we have 6 WALKHAM MEADOWS listed twice, which should be 6a and 6b
+    eco4_prospects_survey_list.loc[838, "NO"] = "6a"
+    eco4_prospects_survey_list.loc[839, "NO"] = "6b"
+
     matched = []
     for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
+
+        # Not in the survey list
+        if (row["ADDRESS 1"] == "berry park") and row["NO"] in [40, 42] and row["POSTCODE"] == "PL12 6EN":
+            continue
+
         house_number = row["NO"]
         if isinstance(house_number, str):
             house_number = house_number.lower()

From f2305fa9b22d15bd9b48e1e0c9ced179db148283 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 11:21:32 +0000
Subject: [PATCH 33/40] fixing matching wip

---
 etl/eligibility/ha_15_32/ha25_app.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 52d11a27..85cfe7c9 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -174,6 +174,14 @@ def load_data():
     eco4_prospects_survey_list.loc[838, "NO"] = "6a"
     eco4_prospects_survey_list.loc[839, "NO"] = "6b"
 
+    # 3, 7, 9 BOLDVENTURE ROAD should be BOLDVENTURE CLOSE
+    eco4_prospects_survey_list["ADDRESS 1"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "boldventure road") &
+        (eco4_prospects_survey_list["NO"].isin([3, 7, 9])),
+        "boldventure close",
+        eco4_prospects_survey_list["ADDRESS 1"]
+    )
+
     matched = []
     for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
 

From dcc775719041e96c26c0003275da889fe8829774 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 11:36:04 +0000
Subject: [PATCH 34/40] 41% through matching

---
 etl/eligibility/ha_15_32/ha25_app.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 85cfe7c9..3ff43291 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -182,13 +182,24 @@ def load_data():
         eco4_prospects_survey_list["ADDRESS 1"]
     )
 
+    eco4_prospects_survey_list["ADDRESS 1"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "old farm road") & (
+                eco4_prospects_survey_list["POSTCODE"] == "PL5 1EP"),
+        "old school road",
+        eco4_prospects_survey_list["ADDRESS 1"]
+    )
+
     matched = []
     for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
 
-        # Not in the survey list
+        # Not in the asset list
         if (row["ADDRESS 1"] == "berry park") and row["NO"] in [40, 42] and row["POSTCODE"] == "PL12 6EN":
             continue
 
+        # Not in the asset list
+        if (row["ADDRESS 1"] == "roberts road") and row["NO"] == 23 and row["POSTCODE"] == "PL5 1DP":
+            continue
+
         house_number = row["NO"]
         if isinstance(house_number, str):
             house_number = house_number.lower()

From c88afeb3301de36967845840689a8522ce605efd Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 11:45:14 +0000
Subject: [PATCH 35/40] 60% through matching

---
 etl/eligibility/ha_15_32/ha25_app.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 3ff43291..8536692a 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -184,11 +184,24 @@ def load_data():
 
     eco4_prospects_survey_list["ADDRESS 1"] = np.where(
         (eco4_prospects_survey_list["ADDRESS 1"] == "old farm road") & (
-                eco4_prospects_survey_list["POSTCODE"] == "PL5 1EP"),
+            eco4_prospects_survey_list["POSTCODE"] == "PL5 1EP"),
         "old school road",
         eco4_prospects_survey_list["ADDRESS 1"]
     )
 
+    eco4_prospects_survey_list["ADDRESS 1"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "croft orchard") & (
+            eco4_prospects_survey_list["POSTCODE"] == "TQ12 6RP") & (
+            eco4_prospects_survey_list["NO"] == 52),
+        "drum way",
+        eco4_prospects_survey_list["ADDRESS 1"]
+    )
+
+    # String replace
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(
+        "the gulls, collaton road", "the gulls collaton road"
+    )
+
     matched = []
     for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
 
@@ -209,9 +222,12 @@ def load_data():
 
         # Filter on the first line of the address
         df = asset_list[asset_list["T1_Address"].str.lower().str.contains(row["ADDRESS 1"].lower())].copy()
-        df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
+        if house_number is not None:
+            if df.shape[0] != 1:
+                df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
         if df.shape[0] != 1:
-            df = df[df["HouseNo"] == str(house_number)]
+            if house_number is not None:
+                df = df[df["HouseNo"] == str(house_number)]
             if df.shape[0] != 1:
                 df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
                 if df.shape[0] != 1:

From d9a3ac37c9a6299a6ba732c1e51aaee8a59432bf Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 12:13:25 +0000
Subject: [PATCH 36/40] done with matching

---
 etl/eligibility/ha_15_32/ha25_app.py | 41 ++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 8536692a..2dd0b0b7 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -201,16 +201,31 @@ def load_data():
     eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(
         "the gulls, collaton road", "the gulls collaton road"
     )
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(
+        "crows-an-eglose", "crows-an-eglos"
+    )
 
+    # We have a high volume of rows that do not match
     matched = []
+    nomatch = []
     for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
 
         # Not in the asset list
         if (row["ADDRESS 1"] == "berry park") and row["NO"] in [40, 42] and row["POSTCODE"] == "PL12 6EN":
+            nomatch.append(row.to_dict())
             continue
 
         # Not in the asset list
         if (row["ADDRESS 1"] == "roberts road") and row["NO"] == 23 and row["POSTCODE"] == "PL5 1DP":
+            nomatch.append(row.to_dict())
+            continue
+
+        # Not in the asset list
+        if row["ADDRESS 1"] in [
+            "kaynton mead", "broadmoor lane", "hoopers barton", "ecos court", "selwood road",
+            "castle street"
+        ]:
+            nomatch.append(row.to_dict())
             continue
 
         house_number = row["NO"]
@@ -229,12 +244,11 @@ def load_data():
             if house_number is not None:
                 df = df[df["HouseNo"] == str(house_number)]
             if df.shape[0] != 1:
-                df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
+                if row["POSTCODE"] is not None:
+                    df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
                 if df.shape[0] != 1:
-                    print(row["ADDRESS 1"])
-                    print(house_number)
-                    print(row["POSTCODE"].lower())
-                    raise ValueError("Investigate")
+                    nomatch.append(row.to_dict())
+                    continue
 
         matched.append(
             {
@@ -245,3 +259,20 @@ def load_data():
                 "survey_postcode": row["POSTCODE"],
             }
         )
+
+    nomatch = pd.DataFrame(nomatch)
+    matched = pd.DataFrame(matched)
+
+    matched["warmfront_identified"] = True
+
+    # Combine asset list and surveys
+    data = asset_list.merge(
+        matched, how="left", left_on="T1_Address", right_on="matched_address",
+    )
+    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
+
+    return data, eco4_prospects_survey_list
+
+
+def app():
+    data, eco4_prospects_survey_list = load_data()

From 65b39750a30d5714f15f9eb6e64688261ededc8e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 12:16:29 +0000
Subject: [PATCH 37/40] setting up app

---
 etl/eligibility/ha_15_32/ha24_app.py |   2 +-
 etl/eligibility/ha_15_32/ha25_app.py | 190 +++++++++++++++++++++++++++
 2 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha24_app.py b/etl/eligibility/ha_15_32/ha24_app.py
index dca2b60f..3edf8735 100644
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@@ -395,7 +395,7 @@ def analyse_results(results_df, data, survey_list):
 def app():
     data, survey_list = load_data()
 
-    data["row_id"] = ["ha16_" + str(i) for i in range(0, len(data))]
+    data["row_id"] = ["ha24_" + str(i) for i in range(0, len(data))]
 
     cleaned = read_from_s3(
         s3_file_name="cleaned_epc_data/cleaned.bson",
diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 2dd0b0b7..b2465913 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -75,6 +75,7 @@ def load_data():
 
     # We could re-concatenate but we only care about HouseNo for the moment
     asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1)
+    asset_list["postcode"] = asset_list["postcode"].str.strip()
 
     # We analysis historical ECO3 survey list
     eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
@@ -274,5 +275,194 @@ def load_data():
     return data, eco4_prospects_survey_list
 
 
+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        # We also want to get the penultimate epc
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+            # If this is the case, we need to update the older epcs
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full checks
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            if eligibility.epc["uprn"] == "":
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
+            scoring_dictionary = prepare_model_data_row(
+                property_id=property_meta["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["Address"],
+                "Postcode": property_meta["Postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    eligibility_assessment = []
+    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+        # The upgrade requirements are dependent on the current SAP
+
+        # If the property is an F or G (SAP <= 38), a lower post-install SAP target applies
+        if row["sap"] <= 38:
+            if row["post_install_sap"] >= 57:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 55:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 53:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+        else:
+
+            if row["post_install_sap"] >= 71:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 69:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 67:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+
+        eligibility_assessment.append(
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": eligibility_classification
+            }
+        )
+
+    eligibility_assessment = pd.DataFrame(eligibility_assessment)
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    return results_df, scoring_data, nodata
+
+
 def app():
     data, eco4_prospects_survey_list = load_data()
+
+    data["row_id"] = ["ha25_" + str(i) for i in range(0, len(data))]
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()

From 776bca7c249ec49cd82db3c95ca340ce408b25ae Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 12:20:53 +0000
Subject: [PATCH 38/40] fixing get epc data

---
 backend/SearchEpc.py                 | 1 -
 etl/eligibility/ha_15_32/ha25_app.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index f1cda010..238ae465 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -143,7 +143,6 @@ class SearchEpc:
         if len(uprns) == 1:
             return rows
 
-        logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
         if property_type is not None:
             # We can do a filter on the property type
             rows_filtered = [r for r in rows if r["property-type"] == property_type]
diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index b2465913..07470b51 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -292,7 +292,7 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
             nodata.append(property_meta)
             continue
 
-        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["T1_Address"])
         # We also want to get the penultimate epc
         penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
         if not penultimate_epc:
@@ -336,8 +336,8 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
             {
                 "row_id": property_meta["row_id"],
                 "uprn": eligibility.epc["uprn"],
-                "Address": property_meta["Address"],
-                "Postcode": property_meta["Postcode"],
+                "Address": property_meta["T1_Address"],
+                "Postcode": property_meta["postcode"],
                 "property_type": eligibility.epc["property-type"],
                 "gbis_eligible": eligibility.gbis_warmfront,
                 "eco4_eligible": eligibility.eco4_warmfront["eligible"],

From a40c1670cbf971a5dac633b7c0574df0268f0789 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 28 Dec 2023 14:08:40 +0000
Subject: [PATCH 39/40] completed ha25 analysis

---
 etl/eligibility/ha_15_32/ha25_app.py | 53 ++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py
index 07470b51..4d86a546 100644
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -450,6 +450,46 @@ def get_epc_data(data, cleaned, cleaning_data, created_at):
     return results_df, scoring_data, nodata
 
 
+def analyse_results(results_df, data, eco4_prospects_survey_list):
+    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
+        results_df, how="left", on="row_id"
+    )
+
+    warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
+
+    # Of the ECO jobs, what proportion do we get right
+
+    success_rate = (warmfront_identified["eco4_eligible"] | warmfront_identified["gbis_eligible"]).sum() / \
+                   warmfront_identified.shape[
+                       0]
+
+    # No gbis for this
+    # gbis_success_rate = warmfront_identified_gbis["gbis_eligible"].sum() / warmfront_identified_gbis.shape[0]
+
+    # Additional identified
+    additional_identified_eco = analysis_data[
+        (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False)
+        ]
+
+    additional_identified_eco["eligibility_classification"].value_counts()
+
+    additional_identified_gbis = analysis_data[
+        (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & (
+            analysis_data["warmfront_identified"] == False
+        )
+        ].shape[0]
+
+    # Future
+    additional_identified_eco_future = analysis_data[
+        (analysis_data["eco4_eligible_future"] == True) & (analysis_data["warmfront_identified"] == False)
+        ].shape[0]
+    additional_identified_gbis_future = analysis_data[
+        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False) & (
+            analysis_data["warmfront_identified"] == False
+        )
+        ].shape[0]
+
+
 def app():
     data, eco4_prospects_survey_list = load_data()
 
@@ -466,3 +506,16 @@ def app():
     )
 
     created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_epc_data(data, cleaned, cleaning_data, created_at)
+    # Pickle the outputs
+    # import pickle
+    # with open("ha25.pickle", "wb") as f:
+    #     pickle.dump(
+    #         {
+    #             "results_df": results_df,
+    #             "scoring_data": scoring_data,
+    #             "nodata": nodata
+    #         },
+    #         f
+    #     )

From 1f57ed0f9e7eb637099798949cd3743bd7b77fe5 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 29 Dec 2023 11:10:31 +0000
Subject: [PATCH 40/40] minor eligibility tweaks

---
 etl/eligibility/ha_15_32/app.py      | 57 ++++++++++++++++++++--------
 etl/eligibility/ha_15_32/ha33_app.py | 26 ++++++++-----
 etl/eligibility/ha_15_32/ha4_app.py  | 20 ++++++----
 3 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py
index ccceb05f..48bfeb2c 100644
--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@@ -833,6 +833,18 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):
         results_df["warmfront_identified"]
     ]
 
+    # Aggregate counts of the ECO4 and GBIS jobs identified
+    n_eco = results_df["eco4_eligible"].sum()
+    # GBIS count: rows that are GBIS-eligible but not ECO4-eligible
+    n_gbis = results_df[
+        (results_df["gbis_eligible"] == True) & (results_df["eco4_eligible"] == False)
+        ]["gbis_eligible"].sum()
+
+    pipeline_potential = results_df[
+        (results_df["warmfront_identified"] == True) | (results_df["eco4_eligible"] == True) | (
+            results_df["gbis_eligible"] == True)
+        ]
+
     success_rate = warmfront_identified["gbis_eligible"].sum() / warmfront_identified.shape[0]
     # For HA32, this is 89%
 
@@ -890,8 +902,16 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):
 
     new_possibilities = results_df[
         (~results_df["warmfront_identified"]) &
-        (results_df["gbis_eligible"] | results_df["eco4_eligible"]) &
-        (results_df["tenure"] == "Rented (social)")
+        (results_df["gbis_eligible"] | results_df["eco4_eligible"])
+        ].copy()
+
+    new_possibilities_eco = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["eco4_eligible"] == True)
+        ].copy()
+    new_possibilities_gbis = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["eco4_eligible"] == False) & (results_df["gbis_eligible"] == True)
         ].copy()
 
     future_possibilities_eco = results_df[
@@ -959,6 +979,11 @@ def analyse_ha_15_results(results_df, ha15, no_house_numbers):
         "eligibility_classification"].value_counts()
     # For HA15 this is 50.3%
 
+    pipeline_potential = results_df[
+        (results_df["warmfront_identified"] == True) | (results_df["eco4_eligible"] == True) | (
+            results_df["gbis_eligible"] == True)
+        ]
+
     # of the properties we identify, what is the mix of confidenc
 
     missed = results_df[
@@ -977,32 +1002,32 @@ def analyse_ha_15_results(results_df, ha15, no_house_numbers):
         missed["sap"] < 69
         ]
 
-    sap_low_enough["walls"].value_counts()
-    z = ha15[ha15["row_id"].isin(sap_too_high["row_id"].values)]
-
-    investigate_1 = ha15[ha15["row_id"].isin(sap_too_high["row_id"])][
-        ["row_id", "Postcode", "Address Line 1", "Address Line 2", "Address Line 3"]]
-
-    investigate_2 = ha15[ha15["row_id"].isin(sap_low_enough["row_id"])][
-        ["row_id", "Postcode", "Address Line 1", "Address Line 2", "Address Line 3"]]
-
-    missed["message"].value_counts()
+    # Aggregate counts of the ECO4 and GBIS jobs identified
+    n_eco = results_df["eco4_eligible"].sum()
+    # GBIS count: rows that are GBIS-eligible but not ECO4-eligible
+    n_gbis = results_df[
+        (results_df["gbis_eligible"] == True) & (results_df["eco4_eligible"] == False)
+        ]["gbis_eligible"].sum()
 
     # We now look for properties that we identified, that were not identified by Warmfront
 
     new_possibilities = results_df[
         (~results_df["warmfront_identified"]) &
-        ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True)) &
-        (results_df["tenure"] == "Rented (social)")
+        ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True))
+        ].copy()
+
+    new_possibilities_eco = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["eco4_eligible"] == True)
         ].copy()
 
     # These are future possibilityies
-    new_possibilities_eco = results_df[
+    future_possibilities_eco = results_df[
         (~results_df["warmfront_identified"]) &
         (results_df["eco4_eligible_future"] == True) & (~(results_df["gbis_eligible"] | results_df["eco4_eligible"]))
         ].copy()
 
-    new_possibilities_gbis = results_df[
+    future_possibilities_gbis = results_df[
         (~results_df["warmfront_identified"]) &
         (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & (
             ~(results_df["gbis_eligible"] | results_df["eco4_eligible"]))
diff --git a/etl/eligibility/ha_15_32/ha33_app.py b/etl/eligibility/ha_15_32/ha33_app.py
index 9af5eae2..42c8fa81 100644
--- a/etl/eligibility/ha_15_32/ha33_app.py
+++ b/etl/eligibility/ha_15_32/ha33_app.py
@@ -264,21 +264,21 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):
 
 
 def analyse_ha_33(results_df, data):
-    results_df_social = results_df[results_df["tenure"] == "Rented (social)"]
+    # results_df_social = results_df[results_df["tenure"] == "Rented (social)"]
+    #
+    # results_df_social["tenure"].value_counts()
 
-    results_df_social["tenure"].value_counts()
+    data[data["row_id"].isin(results_df["row_id"].values)]["PROPERTY TYPE"].value_counts()
 
-    data[data["row_id"].isin(results_df_social["row_id"].values)]["PROPERTY TYPE"].value_counts()
+    n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum()
+    n_eco4 = results_df["eco4_eligible"].sum()
+    n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum()
 
-    n_identified = (results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]).sum()
-    n_eco4 = results_df_social["eco4_eligible"].sum()
-    n_gbis = results_df_social[~results_df_social["eco4_eligible"]]["gbis_eligible"].sum()
-
-    eco_eligibile = results_df_social[results_df_social["eco4_eligible"]]
+    eco_eligibile = results_df[results_df["eco4_eligible"]]
     eco_eligibile["walls"].value_counts()
     eco_eligibile["roof"].value_counts()
 
-    results_df_social[results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]]["tenure"].value_counts()
+    results_df[results_df["gbis_eligible"] | results_df["eco4_eligible"]]["tenure"].value_counts()
 
-    results_df_social["eligibility_classification"].value_counts()
+    results_df["eligibility_classification"].value_counts()
 
@@ -316,3 +316,11 @@ def app():
     created_at = datetime.now().isoformat()
 
     results_df, _, _ = get_ha_33data(data, cleaned, cleaning_data, created_at)
+
+    # Read in previously pickled results (this overwrites `data` and `results_df` computed above)
+    import pickle
+    with open("ha33_results.pickle", "rb") as f:
+        data = pickle.load(f)
+    results_df = pd.DataFrame(data["results"])
+    scoring_data = data["scoring_data"]
+    nodata = data["nodata"]
diff --git a/etl/eligibility/ha_15_32/ha4_app.py b/etl/eligibility/ha_15_32/ha4_app.py
index 8a404eec..92b03539 100644
--- a/etl/eligibility/ha_15_32/ha4_app.py
+++ b/etl/eligibility/ha_15_32/ha4_app.py
@@ -241,15 +241,11 @@ def get_ha_4_data(data, cleaned, cleaning_data, created_at):
 
 
 def analyse_ha_4(results_df, data):
-    results_df_social = results_df[results_df["tenure"] == "Rented (social)"]
+    n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum()
+    n_eco4 = results_df["eco4_eligible"].sum()
+    n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum()
 
-    results_df_social["property_type"].value_counts()
-
-    n_identified = (results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]).sum()
-    n_eco4 = results_df_social["eco4_eligible"].sum()
-    n_gbis = results_df_social[~results_df_social["eco4_eligible"]]["gbis_eligible"].sum()
-
-    eco_eligibile = results_df_social[results_df_social["eco4_eligible"]]
+    eco_eligibile = results_df[results_df["eco4_eligible"]]
     eco_eligibile["eligibility_classification"].value_counts()
 
     future_possibilities_eco = results_df[
@@ -299,3 +295,11 @@ def app():
     #             "scoring_data": scoring_data,
     #             "nodata": nodata
     #         }, f)
+
+    # Read in previously pickled outputs (currently disabled)
+    # import pickle
+    # with open("ha_4.pickle", "rb") as f:
+    #     data = pickle.load(f)
+    # results_df = data["results_df"]
+    # scoring_data = data["scoring_data"]
+    # nodata = data["nodata"]