Merge pull request #267 from Hestia-Homes/ha7-analysis

Ha7 analysis
2026-07-27 23:35:01 +00:00 · 2023-12-29 11:12:11 +00:00 · 2023-12-29 11:12:11 +00:00 · eeb3653afa
commit eeb3653afa
parent 7ad4c10804 1f57ed0f9e
9 changed files with 1824 additions and 36 deletions
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -143,7 +143,6 @@ class SearchEpc:
        if len(uprns) == 1:
            return rows

-        logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO")
        if property_type is not None:
            # We can do a filter on the property type
            rows_filtered = [r for r in rows if r["property-type"] == property_type]
@ -202,7 +201,9 @@ class SearchEpc:
            return {}, []

        if len(newest_response) != 1:
-            raise Exception("More than one result found for this address - investigate me")
+            # It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that
+            # were lodged at the exact same time. In this case, we will take the first one
+            newest_response = [newest_response[0]]

        older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]]

--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@ -235,6 +235,14 @@ class Eligibility:
        }

    def suspended_floor_insulation(self):
+
+        if "no_data" in self.floor.keys():
+            if self.floor["no_data"]:
+                self.suspended_floor = {
+                    "suitability": False,
+                }
+                return
+
        is_suspended = self.floor["is_suspended"]
        is_insulated = self.floor["insulation_thickness"] in ["average", "above average"]

@ -244,6 +252,14 @@ class Eligibility:
        return

    def solid_floor_insulation(self):
+
+        if "no_data" in self.floor.keys():
+            if self.floor["no_data"]:
+                self.solid_floor = {
+                    "suitability": False,
+                }
+                return
+
        is_solid = self.floor["is_solid"]
        is_insulated = self.floor["insulation_thickness"] in ["average", "above average"]

@ -331,9 +347,10 @@ class Eligibility:
        is_eligible = self.cavity["suitability"] & self.loft["suitability"]

        if post_retrofit_sap is None:
+            message = "subject to post retrofit sap" if is_eligible else "not eligible"
            self.eco4_warmfront = {
                "eligible": is_eligible,
-                "message": "subject to post retrofit sap"
+                "message": message
            }
            return

--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@ -833,6 +833,18 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):
        results_df["warmfront_identified"]
    ]

+    # Aggregates of no eco and gbis jobs identified
+    n_eco = results_df["eco4_eligible"].sum()
+    # Gbis is rows where eco4 is not eligible
+    n_gbis = results_df[
+        (results_df["gbis_eligible"] == True) & (results_df["eco4_eligible"] == False)
+        ]["gbis_eligible"].sum()
+
+    pipeline_potential = results_df[
+        (results_df["warmfront_identified"] == True) | (results_df["eco4_eligible"] == True) | (
+            results_df["gbis_eligible"] == True)
+        ]
+
    success_rate = warmfront_identified["gbis_eligible"].sum() / warmfront_identified.shape[0]
    # For HA32, this is 89%

@ -890,8 +902,16 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):

    new_possibilities = results_df[
        (~results_df["warmfront_identified"]) &
-        (results_df["gbis_eligible"] | results_df["eco4_eligible"]) &
-        (results_df["tenure"] == "Rented (social)")
+        (results_df["gbis_eligible"] | results_df["eco4_eligible"])
+        ].copy()
+
+    new_possibilities_eco = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["eco4_eligible"] == True)
+        ].copy()
+    new_possibilities_gbis = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["eco4_eligible"] == False) & (results_df["gbis_eligible"] == True)
        ].copy()

    future_possibilities_eco = results_df[
@ -959,6 +979,11 @@ def analyse_ha_15_results(results_df, ha15, no_house_numbers):
        "eligibility_classification"].value_counts()
    # For HA15 this is 50.3%

+    pipeline_potential = results_df[
+        (results_df["warmfront_identified"] == True) | (results_df["eco4_eligible"] == True) | (
+            results_df["gbis_eligible"] == True)
+        ]
+
    # of the properties we identify, what is the mix of confidenc

    missed = results_df[
@ -977,32 +1002,32 @@ def analyse_ha_15_results(results_df, ha15, no_house_numbers):
        missed["sap"] < 69
        ]

-    sap_low_enough["walls"].value_counts()
-    z = ha15[ha15["row_id"].isin(sap_too_high["row_id"].values)]
-
-    investigate_1 = ha15[ha15["row_id"].isin(sap_too_high["row_id"])][
-        ["row_id", "Postcode", "Address Line 1", "Address Line 2", "Address Line 3"]]
-
-    investigate_2 = ha15[ha15["row_id"].isin(sap_low_enough["row_id"])][
-        ["row_id", "Postcode", "Address Line 1", "Address Line 2", "Address Line 3"]]
-
-    missed["message"].value_counts()
+    # Aggregates of no eco and gbis jobs identified
+    n_eco = results_df["eco4_eligible"].sum()
+    # Gbis is rows where eco4 is not eligible
+    n_gbis = results_df[
+        (results_df["gbis_eligible"] == True) & (results_df["eco4_eligible"] == False)
+        ]["gbis_eligible"].sum()

    # We now look for properties that we identified, that were not identified by Warmfront

    new_possibilities = results_df[
        (~results_df["warmfront_identified"]) &
-        ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True)) &
-        (results_df["tenure"] == "Rented (social)")
+        ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True))
+        ].copy()
+
+    new_possibilities_eco = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["eco4_eligible"] == True)
        ].copy()

    # These are future possibilityies
-    new_possibilities_eco = results_df[
+    future_possibilities_eco = results_df[
        (~results_df["warmfront_identified"]) &
        (results_df["eco4_eligible_future"] == True) & (~(results_df["gbis_eligible"] | results_df["eco4_eligible"]))
        ].copy()

-    new_possibilities_gbis = results_df[
+    future_possibilities_gbis = results_df[
        (~results_df["warmfront_identified"]) &
        (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & (
            ~(results_df["gbis_eligible"] | results_df["eco4_eligible"]))
--- a/etl/eligibility/ha_15_32/ha16_app.py
+++ b/etl/eligibility/ha_15_32/ha16_app.py
@ -0,0 +1,502 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+# This module already lives in etl/eligibility/ha_15_32/, so the .env is resolved next to it.
+# Appending "etl/eligibility/ha_15_32" to the module directory again pointed at a doubled path,
+# which load_dotenv silently ignores when the file is missing.
+# NOTE(review): assumes the .env sits alongside this module - confirm.
+ENV_FILE = Path(__file__).parent / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+# Ordered (wrong, right) corrections for survey street names. They are applied one after another to
+# the already lower-cased names, so order matters (e.g. "cresent" is fixed before the "fitton hill" rules).
+_HA16_STREET_NAME_CORRECTIONS = [
+    ("eccels", "eccles"),
+    ("chatley, road", "chatley road"),
+    ("vaughen", "Vaughan"),
+    ("cresent", "crescent"),
+    ("plantation road", "plantation avenue"),
+    ("how clough drive", "howclough drive"),
+    ("brockhurst lane", "brookhurst lane"),
+    ("biirch road", "birch road"),
+    ("hadson road", "hodson road"),
+    ("harbonne avennue", "narbonne avenue"),
+    ("cumberland road, cadishead", "cumberland avenue, cadishead"),
+    ("aston field drive", "ashton field drive"),
+    ("wedgewood road", "wedgwood road"),
+    ("hamilton close", "hamilton avenue"),
+    ("lichens crescent, fitton hill", "lichens crescent"),
+    ("south croft, fitton hill", "south croft"),
+    (", fitton hill", ""),
+    ("firtree dr", "fir tree avenue"),
+    ("hawthorne road", "hawthorn crescent"),
+    ("rein lee avenue", "reins lee avenue"),
+    ("westerhill road", "wester hill road"),
+    ("st martins road", "saint martins road"),
+    ("timperley avenue", "timperley close"),
+    ("eastwood road", "eastwood avenue"),
+    ("new road", "new street"),
+    ("grassmere road", "grasmere road"),
+    ("hulton road", "hulton avenue"),
+    ("beechfield avenue", "beechfield road"),
+    ("princess avenue", "princes avenue"),
+    ("edge ford crecent", "edge fold crescent"),
+    ("conniston avenue", "coniston avenue"),
+    ("blackthorne crescent", "blackthorn crescent"),
+    ("wellstock road", "wellstock lane"),
+    ("brackley avenue", "brackley street"),
+    ("brook avenue swinton", "brook avenue, swinton"),
+    ("green avenue swinton", "green avenue, swinton"),
+    ("grasmere avenue wardley", "grasmere avenue, wardley"),
+    ("mardale avenue wardle", "mardale avenue, wardle"),
+    ("carleach grove", "cartleach Grove"),
+    ("arbour grove", "arbor Grove"),
+]
+
+
+def _read_sheet_rows(sheet):
+    """Return (rows of cell values, fill colour index of each row's first cell) for an openpyxl sheet."""
+    rows = []
+    colours = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        fill_index = row[0].fill.start_color.index
+        rows.append([cell.value for cell in row])
+        # '00000000' is treated as "no fill" and recorded as None
+        colours.append(fill_index if fill_index != '00000000' else None)
+    return rows, colours
+
+
+def load_data():
+    """
+    Load the HA16 asset list and ECO4 survey list, and flag the assets that appear in the survey list.
+
+    The asset list is read from two workbooks and the survey list from a third, all under
+    etl/eligibility/ha_15_32/. Survey street names and postcodes are normalised and hand-corrected,
+    then every survey row is matched to exactly one asset-list address.
+
+    :return: tuple (data, survey_list). data is the asset list left-merged with the matched survey rows,
+        with a boolean "warmfront_identified" column; survey_list is the cleaned survey frame.
+    :raises ValueError: if a survey row cannot be narrowed down to exactly one asset-list row
+    """
+    # This asset list is spread across two sheets, which we need to combine
+    asset_list_filenames = [
+        "HESTIA - HA 16 ASSET LIST PART 1 OF 2.xlsx",
+        "HESTIA - HA 16 ASSET LIST PART 2 OF 2.xlsx",
+    ]
+
+    # Collect the rows and their colours across both workbooks
+    rows_data = []
+    rows_colors = []
+    colnames = []
+    for asset_list_filename in asset_list_filenames:
+        workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/{asset_list_filename}')
+        sheet = workbook.active
+        colnames.append([cell.value for cell in sheet[1]])
+        sheet_rows, sheet_colors = _read_sheet_rows(sheet)
+        rows_data.extend(sheet_rows)
+        rows_colors.extend(sheet_colors)
+
+    # The header of the first workbook is used for the combined frame
+    asset_list = pd.DataFrame(rows_data, columns=colnames[0])
+    # Remove None columns
+    asset_list = asset_list.iloc[:, 0:12]
+    asset_list['row_color'] = rows_colors
+
+    # Anything that is neither red nor green is labelled yellow
+    asset_list["row_colour_name"] = np.where(
+        asset_list["row_color"] == "FFFF0000", "red",
+        np.where(asset_list["row_color"] == "FF92D050", "green", "yellow")
+    )
+
+    # Split up the address on commas, which is useful for matching later
+    split_addresses = asset_list['Address'].str.split(',', expand=True)
+    split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
+
+    asset_list = pd.concat([asset_list, split_addresses], axis=1)
+    # There is no commas separating house number and address 1
+    split_addresses2 = asset_list['temp'].str.split(' ', expand=True)
+    split_addresses2.columns = ['HouseNo', 'part1', 'part2', "part3", "part4"]
+    # We could re-concatenate but we only care about HouseNo for the moment
+    asset_list = pd.concat([asset_list, split_addresses2[["HouseNo"]]], axis=1)
+
+    # We now read in the survey list
+    survey_workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA- HA 16 ECO4 SURVEY LIST.xlsx')
+    survey_sheet = survey_workbook.active
+    survey_rows, survey_colors = _read_sheet_rows(survey_sheet)
+
+    survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+
+    # For the survey list, we don't need the colours, since there is a column called "INSTALLED OR CANCELLED"
+    # which describes the status of the property
+    survey_list["row_colour"] = survey_colors
+    survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
+
+    street_col = "Street / Block Name"
+    # Tidy up the street/block name a bit
+    street = survey_list[street_col].str.replace("/", ", ", regex=False).str.lower()
+    # The former '== "REEDS RD"' special case ran after lower-casing and so could never match;
+    # "reeds rd" is already expanded to "reeds road" by the word-boundary rule below.
+    # Replace the standalone word "rd" with "road"
+    street = street.str.replace(r'\brd\b', 'road', regex=True)
+    # Replace " , " with ", "
+    street = street.str.replace(" , ", ', ', regex=False)
+    # Fix "{place} ,{place}" with "{place}, {place}"
+    street = street.str.replace(r'\s*,\s*', ', ', regex=True)
+    # Strip whitespace
+    survey_list[street_col] = street.str.strip()
+
+    # Correct errors
+    survey_list["Post Code"] = np.where(
+        survey_list["Post Code"] == "M38 0SA",
+        "M38 9SA",
+        survey_list["Post Code"]
+    )
+
+    survey_list["Post Code"] = np.where(
+        (survey_list[street_col] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"),
+        "M44 5JF",
+        survey_list["Post Code"]
+    )
+
+    # Literal (non-regex) replacements, applied in table order
+    for wrong, right in _HA16_STREET_NAME_CORRECTIONS:
+        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right, regex=False)
+
+    # Replacement for clively avenue 66-68
+    survey_list["NO."] = np.where(
+        survey_list["NO."] == "66-68",
+        "66",
+        survey_list["NO."]
+    )
+
+    # We now need to merge the survey list onto the asset list
+    # Could be easier just to do a search on each row, even though it's much slower
+    lower_addresses = asset_list["Address"].str.lower()  # loop-invariant, so computed once
+    matched = []
+    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+
+        house_number = row["NO."]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
+        # Filter on the first line of the address
+        df = asset_list[lower_addresses.str.contains(row[street_col].lower())].copy()
+        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+        # Narrow progressively: exact house number first, then postcode, until one row remains
+        if df.shape[0] != 1:
+            df = df[df["HouseNo"] == str(house_number)]
+            if df.shape[0] != 1:
+                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                if df.shape[0] != 1:
+                    raise ValueError(
+                        f"Investigate: {df.shape[0]} asset list matches for {row['survey_key']} "
+                        f"({house_number} {row[street_col]}, {row['Post Code']})"
+                    )
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["Address"].values[0],
+                "survey_house_no": row["NO."],
+                "survey_street_name": row[street_col],
+                "survey_postcode": row["Post Code"],
+                "survey_status": row["INSTALLED OR CANCELLED"]
+            }
+        )
+
+    matched = pd.DataFrame(matched)
+    matched["warmfront_identified"] = True
+
+    # Combine asset list and surveys
+    data = asset_list.merge(
+        matched, how="left", left_on="Address", right_on="matched_address",
+    )
+    # Assets with no survey match were not identified by Warmfront
+    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
+
+    return data, survey_list
+
+
+def _warmfront_eligibility(epc, cleaned):
+    """Build an Eligibility for the EPC and run the two Warmfront-style checks on it."""
+    eligibility = Eligibility(epc=epc, cleaned=cleaned)
+    eligibility.check_gbis_warmfront()
+    eligibility.check_eco4_warmfront()
+    return eligibility
+
+
+def _classify_eco4_confidence(current_sap, post_install_sap):
+    """Grade the predicted post-install SAP against the target implied by the current SAP."""
+    # Properties at SAP 38 or below are graded around a target of 55, all others around 69.
+    # NOTE(review): presumably F/G properties need to reach band D and the rest band C - confirm.
+    target = 55 if current_sap <= 38 else 69
+    if post_install_sap >= target + 2:
+        return "highest confidence"
+    if post_install_sap >= target:
+        return "high confidence"
+    if post_install_sap >= target - 2:
+        return "medium confidence"
+    # Also reached when the predicted SAP is missing (NaN compares False everywhere)
+    return "unlikely"
+
+
+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    """
+    Look up the EPCs for every property, assess eligibility and score the ECO4-eligible ones.
+
+    :param data: property frame; the columns HouseNo, Postcode, Address, row_id and warmfront_identified are read
+    :param cleaned: cleaned EPC lookup passed through to Eligibility and prepare_model_data_row
+    :param cleaning_data: cleaning dataset passed to the DataProcessor averaging steps
+    :param created_at: timestamp passed to prepare_model_data_row and ModelApi
+    :return: tuple (results_df, scoring_data, nodata): per-property results with sap_uplift, post_install_sap
+        and eligibility_classification; the raw scoring rows; and the rows for which the search had no data
+    """
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["Postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        # We also want to get the penultimate epc
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = _warmfront_eligibility(newest_epc, cleaned)
+
+        # If the newest EPC is not eligible but the property was surveyed, fall back to the EPC before it
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = _warmfront_eligibility(penultimate_epc, cleaned)
+            # If this is the case, we need to update the older epcs
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full checks
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            if eligibility.epc["uprn"] == "":
+                # No UPRN on the EPC: use the numeric part of the row id in its place
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
+            scoring_dictionary = prepare_model_data_row(
+                property_id=property_meta["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["Address"],
+                "Postcode": property_meta["Postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    # NOTE(review): the portfolio id says "ha33" although this script processes HA16 - confirm it is intended
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    # Each prediction row contributes its uplift over the current SAP; uplifts are summed per property
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    # The upgrade requirements are dependent on the current SAP
+    eco4_rows = results_df[results_df["eco4_eligible"] == True]
+    # Explicit columns keep the merge key present even when no row is ECO4 eligible
+    eligibility_assessment = pd.DataFrame(
+        [
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": _classify_eco4_confidence(row["sap"], row["post_install_sap"])
+            }
+            for _, row in eco4_rows.iterrows()
+        ],
+        columns=["row_id", "eligibility_classification"],
+    )
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    return results_df, scoring_data, nodata
+
+
+def analyse_results(results_df, data, survey_list):
+    """
+    Compare our eligibility flags with the properties that appear on the Warmfront survey list.
+
+    :param results_df: per-property results keyed by row_id; the columns eco4_eligible, gbis_eligible,
+        eco4_eligible_future, gbis_eligible_future and eligibility_classification are read
+    :param data: property frame with row_id, survey_key and a boolean warmfront_identified column
+    :param survey_list: survey frame with a survey_key column; its first column is used as the funding scheme
+    :return: dict with eco_success_rate, gbis_success_rate, additional_identified_eco (DataFrame),
+        additional_identified_eco_classification (value counts), additional_identified_gbis,
+        additional_identified_eco_future and additional_identified_gbis_future (row counts).
+        The metrics were previously computed and discarded; callers that ignore the return value are unaffected.
+    """
+    scheme_column = survey_list.columns[0]
+    funding_schemes = survey_list[["survey_key", scheme_column]].rename(columns={scheme_column: "funding_scheme"})
+
+    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
+        results_df, how="left", on="row_id"
+    ).merge(
+        funding_schemes, how="left", on="survey_key"
+    )
+
+    def _hit_rate(subset, flag):
+        # An empty subset has no meaningful rate: report NaN instead of dividing by zero
+        if subset.shape[0] == 0:
+            return float("nan")
+        return subset[flag].sum() / subset.shape[0]
+
+    warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
+
+    # Of the ECO jobs, what proportion do we get right
+    warmfront_identified_eco = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"])
+    ]
+    eco_success_rate = _hit_rate(warmfront_identified_eco, "eco4_eligible")
+
+    # And the same for the GBIS jobs
+    warmfront_identified_gbis = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 GBIS (ECO+)"])
+    ]
+    gbis_success_rate = _hit_rate(warmfront_identified_gbis, "gbis_eligible")
+
+    not_identified = analysis_data["warmfront_identified"] == False
+
+    # Additional identified
+    additional_identified_eco = analysis_data[
+        (analysis_data["eco4_eligible"] == True) & not_identified
+        ]
+    additional_identified_eco_classification = additional_identified_eco["eligibility_classification"].value_counts()
+
+    # GBIS only counts rows that are not already ECO4 eligible
+    additional_identified_gbis = analysis_data[
+        (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & not_identified
+        ].shape[0]
+    # Future
+    additional_identified_eco_future = analysis_data[
+        (analysis_data["eco4_eligible_future"] == True) & not_identified
+        ].shape[0]
+    additional_identified_gbis_future = analysis_data[
+        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False)
+        & not_identified
+        ].shape[0]
+
+    return {
+        "eco_success_rate": eco_success_rate,
+        "gbis_success_rate": gbis_success_rate,
+        "additional_identified_eco": additional_identified_eco,
+        "additional_identified_eco_classification": additional_identified_eco_classification,
+        "additional_identified_gbis": additional_identified_gbis,
+        "additional_identified_eco_future": additional_identified_eco_future,
+        "additional_identified_gbis_future": additional_identified_gbis_future,
+    }
+
+
+def app():
+    """Run the HA16 analysis: load the lists, fetch the cleaning assets from S3 and assess eligibility."""
+    data, survey_list = load_data()
+
+    # Every asset gets a stable identifier of the form "ha16_<position>"
+    data["row_id"] = [f"ha16_{position}" for position in range(len(data))]
+
+    packed_cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev",
+    )
+    cleaned = msgpack.unpackb(packed_cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev",
+        file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_epc_data(
+        data=data,
+        cleaned=cleaned,
+        cleaning_data=cleaning_data,
+        created_at=created_at,
+    )
+
+    # Optional local snapshot of the run, kept disabled:
+    # import pickle
+    # with open("ha16.pickle", "wb") as f:
+    #     pickle.dump(
+    #         {
+    #             "scoring_data": scoring_data,
+    #             "results": results_df,
+    #             "nodata": nodata
+    #         }, f
+    #     )
--- a/etl/eligibility/ha_15_32/ha24_app.py
+++ b/etl/eligibility/ha_15_32/ha24_app.py
@ -0,0 +1,423 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+# This module already lives in etl/eligibility/ha_15_32/, so the .env file is resolved relative to the module's
+# own directory. Appending "etl/eligibility/ha_15_32" again would point at a nested path, and load_dotenv
+# does not raise when the file is missing, so the mistake would go unnoticed.
+ENV_FILE = Path(__file__).parent / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def _read_sheet_with_row_colours(sheet):
+    """Read a worksheet whose first row holds the column headers.
+
+    Returns a ``(headers, rows, colours)`` tuple: the header cell values, one list of cell values per data row
+    (row 2 onwards) and, for each data row, the fill colour index of its first cell - or ``None`` when that
+    index is '00000000'.
+    """
+    headers = [cell.value for cell in sheet[1]]
+    rows = []
+    colours = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):
+        rows.append([cell.value for cell in row])
+        fill_index = row[0].fill.start_color.index
+        colours.append(fill_index if fill_index != '00000000' else None)
+    return headers, rows, colours
+
+
+def load_data():
+    """Load the HA 24 asset list and ECO4 survey list and flag the assets that appear in the survey.
+
+    Both workbooks are read from paths relative to the current working directory. The asset list gets colour
+    derived columns, a corrected ``Postcode`` header and address parts split on commas. The survey street
+    names are normalised so that they can be found inside the asset addresses, and every survey row is then
+    matched to exactly one asset.
+
+    Returns:
+        A ``(data, survey_list)`` tuple. ``data`` is the asset list left-joined to the matched survey rows,
+        with ``warmfront_identified`` set to True for matched assets and False otherwise. ``survey_list`` is
+        the tidied survey sheet with a ``survey_key`` per remaining row.
+
+    Raises:
+        ValueError: if a survey row cannot be narrowed down to exactly one asset.
+    """
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 24 ASSET LIST.xlsx')
+    sheet_colnames, rows_data, rows_colors = _read_sheet_with_row_colours(workbook.active)
+
+    asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+    # Remove None columns. Copy so that the column assignments below act on an independent frame
+    asset_list = asset_list.iloc[:, 0:10].copy()
+    asset_list['row_color'] = rows_colors
+
+    asset_list["row_colour_name"] = np.where(
+        asset_list["row_color"] == "FFFF0000", "red",
+        np.where(asset_list["row_color"] == "FF92D050", "green", "yellow")
+    )
+
+    asset_list["row_colour_code"] = np.where(
+        asset_list["row_colour_name"] == "red", "does not meet criteria",
+        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
+    )
+
+    # The third column is listed as "Address" but it's actually the postcode. We have two Address columns so we
+    # change just the third. The labels are rebuilt and reassigned rather than mutating `columns.values` in
+    # place, because an Index is meant to be immutable and its backing array may be cached or read-only
+    column_names = list(asset_list.columns)
+    column_names[2] = "Postcode"
+    asset_list.columns = column_names
+
+    # Split up the address on commas, which is useful for matching later. The column names are generated from
+    # the number of parts found so an address with more or fewer commas does not break the load
+    split_addresses = asset_list['Address'].str.split(',', expand=True)
+    split_addresses.columns = ['temp'] + [f'address{i}' for i in range(2, split_addresses.shape[1] + 1)]
+    asset_list = pd.concat([asset_list, split_addresses], axis=1)
+
+    # There is no comma separating house number and address 1, so the house number is the first
+    # space-separated token. We only care about HouseNo for the moment
+    asset_list["HouseNo"] = asset_list['temp'].str.split(' ').str[0]
+
+    # Read in surveys
+    survey_workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx')
+    survey_colnames, survey_rows, survey_colors = _read_sheet_with_row_colours(survey_workbook.active)
+
+    survey_list = pd.DataFrame(survey_rows, columns=survey_colnames)
+    survey_list["row_colour"] = survey_colors
+
+    # Tidy up the street/block name a bit
+    street_names = survey_list["Street / Block Name"].str.replace("/", ", ", regex=False)
+    street_names = street_names.str.lower().str.strip()
+
+    # Literal corrections, applied in order so that the survey spelling lines up with the asset list.
+    # regex=False makes sure characters such as "." are not interpreted as regex wildcards
+    street_corrections = (
+        ("council house, nidds lane", "nidds lane"),
+        ("wirral avenue", "wirrall avenue"),
+        ("st ives road", "st. ives crescent"),
+        ("sundringham road", "sandringham road"),
+        ("milton avenue", "milton road"),
+        ("st ives crescent", "st. ives crescent"),
+        ("council house, waterbelly lane", "waterbelly lane"),
+        # Generally remove "council house, " from the start of the street name
+        ("council house, ", ""),
+        ("st. leodegars close", "st leodegars close"),
+    )
+    for wrong, right in street_corrections:
+        street_names = street_names.str.replace(wrong, right, regex=False)
+    survey_list["Street / Block Name"] = street_names
+
+    # Drop all None rows. Copy so that adding survey_key does not write into a view of the original frame
+    survey_list = survey_list[~pd.isnull(survey_list["Street / Block Name"])].copy()
+    survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]
+
+    # The asset list does not change inside the loop, so lower-case the addresses once
+    asset_address_lower = asset_list["Address"].str.lower()
+
+    matched = []
+    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+        house_number = row["NO."]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
+        # Filter on the street name, then on the house number appearing anywhere in the address
+        df = asset_list[asset_address_lower.str.contains(row["Street / Block Name"].lower())].copy()
+        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+
+        # Progressively stricter filters, each one only applied while the match is still ambiguous
+        if df.shape[0] != 1:
+            df = df[df["HouseNo"] == str(house_number)]
+        if df.shape[0] != 1:
+            df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+        if df.shape[0] != 1:
+            raise ValueError(
+                "Investigate: survey row does not match exactly one asset "
+                f"(street={row['Street / Block Name']!r}, house_number={house_number!r}, "
+                f"postcode={row['Post Code']!r}, candidates={df.shape[0]})"
+            )
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["Address"].values[0],
+                "survey_house_no": row["NO."],
+                "survey_street_name": row["Street / Block Name"],
+                "survey_postcode": row["Post Code"],
+                "survey_status": row["INSTALLED OR CANCELLED"]
+            }
+        )
+
+    matched = pd.DataFrame(matched)
+    matched["warmfront_identified"] = True
+
+    # Combine asset list and surveys
+    data = asset_list.merge(
+        matched, how="left", left_on="Address", right_on="matched_address",
+    )
+    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
+
+    return data, survey_list
+
+
+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    """Look up EPCs for every asset, assess eligibility and score the ECO4-eligible properties.
+
+    Args:
+        data: asset frame; the columns read here are ``HouseNo``, ``Postcode``, ``Address``,
+            ``warmfront_identified`` and ``row_id``.
+        cleaned: object passed straight through to ``Eligibility`` and ``prepare_model_data_row``.
+        cleaning_data: dataset passed to ``prepare_model_data_row`` and the ``DataProcessor`` cleaning steps.
+        created_at: timestamp passed to ``prepare_model_data_row`` and ``ModelApi``.
+
+    Returns:
+        A ``(results_df, scoring_data, nodata)`` tuple. ``results_df`` has one row per asset with EPC data,
+        including ``sap_uplift``, ``post_install_sap`` and ``eligibility_classification``. ``scoring_data``
+        is the raw list of model rows (not the cleaned frame). ``nodata`` lists the asset rows for which
+        the search returned no data.
+    """
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["Postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        # Nothing found for this house number / postcode: remember the asset and move on
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["Address"])
+        # We also want to get the penultimate epc
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        # With no older EPC available, fall back to the newest one
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        # The survey identified this property but the newest EPC fails both checks: re-run the checks on the
+        # penultimate EPC instead
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+            # If this is the case, we need to update the older epcs
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full checks
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        # Only ECO4-eligible properties are sent to the model
+        if eligibility.eco4_warmfront["eligible"]:
+            # An empty UPRN is replaced by the numeric part of the row id
+            if eligibility.epc["uprn"] == "":
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
+            scoring_dictionary = prepare_model_data_row(
+                property_id=property_meta["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["Address"],
+                "Postcode": property_meta["Postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    # NOTE(review): if no property was ECO4-eligible this frame is empty - confirm the cleaning steps and the
+    # model call below cope with that
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    # NOTE(review): the portfolio id says "ha33" although app() in this file builds "ha24_" row ids - confirm
+    # whether this was carried over from another script
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    results_df = pd.DataFrame(results)
+
+    # Attach the current SAP to every prediction row so the uplift can be computed per row
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    # A property can have several prediction rows; their uplifts are added together
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    eligibility_assessment = []
+    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
+        # The upgrade requirements are dependent on the current SAP
+
+        # Properties with a current SAP of 38 or below (the original note refers to these as F or G) are
+        # graded against post-install thresholds of 57 / 55 / 53; all others against 71 / 69 / 67.
+        # A missing post_install_sap fails every comparison and is therefore classed as "unlikely"
+        if row["sap"] <= 38:
+            if row["post_install_sap"] >= 57:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 55:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 53:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+        else:
+
+            if row["post_install_sap"] >= 71:
+                eligibility_classification = "highest confidence"
+            elif row["post_install_sap"] >= 69:
+                eligibility_classification = "high confidence"
+            elif row["post_install_sap"] >= 67:
+                eligibility_classification = "medium confidence"
+            else:
+                eligibility_classification = "unlikely"
+
+        eligibility_assessment.append(
+            {
+                "row_id": row["row_id"],
+                "eligibility_classification": eligibility_classification
+            }
+        )
+
+    eligibility_assessment = pd.DataFrame(eligibility_assessment)
+
+    # Properties that are not ECO4-eligible keep a missing classification after this left join
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    return results_df, scoring_data, nodata
+
+
+def analyse_results(results_df, data, survey_list):
+    """Compare our eligibility results with the properties identified in the survey list.
+
+    Args:
+        results_df: per-property results; the columns read here are ``row_id``, ``eco4_eligible``,
+            ``gbis_eligible``, ``eco4_eligible_future``, ``gbis_eligible_future`` and
+            ``eligibility_classification``.
+        data: asset frame with ``row_id``, ``survey_key`` and ``warmfront_identified``.
+        survey_list: survey frame with ``survey_key``; its first column is used as the funding scheme.
+
+    Returns:
+        A dict with the keys
+        ``eco_success_rate`` (share of surveyed ECO jobs we also flag as ECO4 eligible, or ``None`` when
+        there are no such jobs),
+        ``additional_identified_eco`` and ``additional_identified_eco_classification`` (count and
+        classification breakdown of ECO4-eligible properties not in the survey),
+        ``additional_identified_gbis``, ``additional_identified_eco_future`` and
+        ``additional_identified_gbis_future`` (counts of further properties not in the survey).
+        Previously these figures were computed and then thrown away.
+    """
+    funding_column = survey_list.columns[0]
+    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
+        results_df, how="left", on="row_id"
+    ).merge(
+        survey_list[["survey_key", funding_column]].rename(columns={funding_column: "funding_scheme"}),
+        how="left", on="survey_key"
+    )
+
+    warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
+
+    # Of the ECO jobs, what proportion do we get right
+    warmfront_identified_eco = warmfront_identified[
+        warmfront_identified["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"])
+    ]
+    eco_job_count = warmfront_identified_eco.shape[0]
+    # Guard the division: with no ECO jobs in the survey there is no rate to report
+    eco_success_rate = (
+        warmfront_identified_eco["eco4_eligible"].sum() / eco_job_count if eco_job_count else None
+    )
+
+    # No GBIS jobs in this survey list, so no GBIS success rate is calculated
+
+    # Explicit comparisons keep rows with missing values (assets without EPC data) out of every mask
+    not_identified = analysis_data["warmfront_identified"] == False
+    eco_now = analysis_data["eco4_eligible"] == True
+    eco_future = analysis_data["eco4_eligible_future"] == True
+
+    # Additional identified
+    additional_identified_eco = analysis_data[eco_now & not_identified]
+    additional_identified_gbis = analysis_data[
+        (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & not_identified
+    ].shape[0]
+
+    # Future
+    additional_identified_eco_future = analysis_data[eco_future & not_identified].shape[0]
+    additional_identified_gbis_future = analysis_data[
+        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False)
+        & not_identified
+    ].shape[0]
+
+    return {
+        "eco_success_rate": eco_success_rate,
+        "additional_identified_eco": additional_identified_eco.shape[0],
+        "additional_identified_eco_classification":
+            additional_identified_eco["eligibility_classification"].value_counts(),
+        "additional_identified_gbis": additional_identified_gbis,
+        "additional_identified_eco_future": additional_identified_eco_future,
+        "additional_identified_gbis_future": additional_identified_gbis_future,
+    }
+
+
+def app():
+    """Run the HA 24 eligibility pipeline end to end.
+
+    Loads the asset and survey data, tags each asset row with a ``row_id``, fetches the cleaned EPC lookup and
+    the cleaning dataset from S3 and passes everything to ``get_epc_data``. Nothing is returned or persisted;
+    the commented-out block at the bottom shows how the outputs can be pickled when needed.
+    """
+    data, survey_list = load_data()
+
+    # Each asset gets a stable identifier based on its position in the frame
+    data["row_id"] = ["ha24_" + str(position) for position in range(len(data))]
+
+    bucket = "retrofit-data-dev"
+
+    raw_cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name=bucket,
+    )
+    cleaned = msgpack.unpackb(raw_cleaned, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name=bucket,
+        file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    # A single timestamp is shared by every row produced in this run
+    created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_epc_data(data, cleaned, cleaning_data, created_at)
+
+    # Pickle results just in case
+    # import pickle
+    # with open("ha24.pickle", "wb") as f:
+    #     pickle.dump(
+    #         {
+    #             "scoring_data": scoring_data,
+    #             "results": results_df,
+    #             "nodata": nodata
+    #         }, f
+    #     )
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@ -0,0 +1,521 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+# This module already lives in etl/eligibility/ha_15_32/, so the .env file is resolved relative to the module's
+# own directory. Appending "etl/eligibility/ha_15_32" again would point at a nested path, and load_dotenv
+# does not raise when the file is missing, so the mistake would go unnoticed.
+ENV_FILE = Path(__file__).parent / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def _first_cell_colour(row):
+    """Return the fill colour index of the first cell in ``row``, or None when that index is '00000000'."""
+    fill_index = row[0].fill.start_color.index
+    return fill_index if fill_index != '00000000' else None
+
+
+def _read_sheet_with_row_colours(sheet):
+    """Read a worksheet whose first row holds the column headers.
+
+    Returns a ``(headers, rows, colours)`` tuple: the header cell values, one list of cell values per data row
+    (row 2 onwards) and the first-cell fill colour of each data row.
+    """
+    headers = [cell.value for cell in sheet[1]]
+    rows = []
+    colours = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):
+        rows.append([cell.value for cell in row])
+        colours.append(_first_cell_colour(row))
+    return headers, rows, colours
+
+
+def load_data():
+    """Load the HA 25 asset list and flag the assets that appear in the ECO4 prospects survey list.
+
+    Three workbooks are read from paths relative to the current working directory: the asset list, the
+    historical ECO3 survey list (only decoded for ad hoc inspection, it is not returned) and the list of
+    ECO4 prospects. Known errors in the prospects list are corrected and each prospect is then matched to
+    an asset; prospects that cannot be matched to exactly one asset are set aside.
+
+    Returns:
+        A ``(data, eco4_prospects_survey_list)`` tuple. ``data`` is the asset list left-joined to the
+        matched prospects, with ``warmfront_identified`` set to True for matched assets and False otherwise.
+    """
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True)
+    sheet = workbook.active
+
+    # Values and colours are collected in a single pass over the sheet
+    rows_data = []
+    rows_colors = []
+    for row in sheet.iter_rows(min_row=1, values_only=False):
+        rows_data.append([cell.value for cell in row])
+        # Assume first cell color is indicative of entire row
+        rows_colors.append(_first_cell_colour(row))
+
+    # Headers are on the final row. Pop them off and store them, and drop the matching colour entry
+    headers = rows_data.pop()
+    rows_colors.pop()
+    # The postcode header is None, so we replace it with "postcode"
+    headers[-1] = "postcode"
+
+    asset_list = pd.DataFrame(rows_data, columns=headers)
+    asset_list['row_color'] = rows_colors
+
+    asset_list["row_colour_name"] = np.where(
+        asset_list["row_color"] == "FFFF0000", "red",
+        np.where(asset_list["row_color"] == "FF00B050", "green", "yellow")
+    )
+
+    asset_list["row_colour_code"] = np.where(
+        asset_list["row_colour_name"] == "red", "does not meet criteria",
+        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
+    )
+
+    asset_list["address"] = (
+        asset_list["T1_Address"].str.lower().str.replace("flat", "", regex=False).str.strip()
+    )
+
+    # We only care about the house number for the moment: the first space-separated token of the address.
+    # Taking the first token directly avoids having to name one column per token, which fails as soon as an
+    # address has a different number of tokens
+    asset_list["HouseNo"] = asset_list["address"].str.split(' ').str[0].str.replace(";", "", regex=False)
+    asset_list["postcode"] = asset_list["postcode"].str.strip()
+
+    # We analyse the historical ECO3 survey list
+    eco3_survey_workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
+    eco3_headers, eco3_survey_rows, eco3_survey_colors = _read_sheet_with_row_colours(
+        eco3_survey_workbook["CAVITY"]
+    )
+
+    # Some adhoc analysis on the eco3 survey list, just to get completion and cancellation rates historically
+    eco3_survey_list = pd.DataFrame(eco3_survey_rows, columns=eco3_headers)
+    eco3_survey_list["row_colour"] = eco3_survey_colors
+    # Remove rows where street name is missing. Copy so the new columns are written to an independent frame
+    eco3_survey_list = eco3_survey_list[~pd.isnull(eco3_survey_list["Street / Block Name"])].copy()
+
+    # Parse the row colours; anything not listed is "unknown"
+    colour_names = {
+        "FF7030A0": "purple",
+        "FF92D050": "green",
+        "FFFF0000": "red",
+        "FFFFFF00": "yellow",
+        "FF38FD23": "green",
+    }
+    eco3_survey_list["row_colour_name"] = eco3_survey_list["row_colour"].map(colour_names).fillna("unknown")
+
+    # Map the meaning of each colour; yellow rows are filler rows
+    colour_meanings = {
+        "red": "cancelled",
+        "green": "installed advised install complete",
+        "purple": "installer advised install complete + post works EPC",
+        "yellow": "filler row - drop",
+    }
+    eco3_survey_list["row_colour_code"] = (
+        eco3_survey_list["row_colour_name"].map(colour_meanings).fillna("unknown")
+    )
+
+    # This is good enough for the indicative cancellation rates
+
+    # We now read in the indicative survey list which identified prospects for ECO4 works
+    eco4_survey_workbook = openpyxl.load_workbook(
+        'etl/eligibility/ha_15_32/HESTIA - HA 25 ADHOC ISOLATED IDENTIFIED PROPERTIES FOR CWI.xlsx'
+    )
+    eco4_headers, eco4_prospects_survey_rows, eco4_prospects_survey_colors = _read_sheet_with_row_colours(
+        eco4_survey_workbook["LiveWest"]
+    )
+
+    eco4_prospects_survey_list = pd.DataFrame(eco4_prospects_survey_rows, columns=eco4_headers)
+    eco4_prospects_survey_list["row_colour"] = eco4_prospects_survey_colors
+
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.lower().str.strip()
+
+    # Drop rows without an address. Copy so the corrections below do not write into a view
+    eco4_prospects_survey_list = eco4_prospects_survey_list[
+        ~pd.isnull(eco4_prospects_survey_list["ADDRESS 1"])
+    ].copy()
+    eco4_prospects_survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(eco4_prospects_survey_list))]
+
+    # Correct some errors in the survey list
+    eco4_prospects_survey_list["POSTCODE"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "berry park") &
+        (eco4_prospects_survey_list["POSTCODE"] == "PL12 6HP"),
+        "PL12 6EN",
+        eco4_prospects_survey_list["POSTCODE"]
+    )
+
+    # Remove semi colons from address in asset and survey list
+    asset_list["T1_Address"] = asset_list["T1_Address"].str.replace(";", "", regex=False)
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(
+        ";", "", regex=False
+    )
+
+    # In the prospects survey list, we have 6 WALKHAM MEADOWS listed twice, which should be 6a and 6b.
+    # NOTE(review): 838 and 839 are index labels of the sheet as it was when this was written - confirm they
+    # still point at those two rows if the workbook changes
+    eco4_prospects_survey_list.loc[838, "NO"] = "6a"
+    eco4_prospects_survey_list.loc[839, "NO"] = "6b"
+
+    # 3, 7, 9 BOLDVENTURE ROAD should be BOLDVENTURE CLOSE
+    eco4_prospects_survey_list["ADDRESS 1"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "boldventure road") &
+        (eco4_prospects_survey_list["NO"].isin([3, 7, 9])),
+        "boldventure close",
+        eco4_prospects_survey_list["ADDRESS 1"]
+    )
+
+    eco4_prospects_survey_list["ADDRESS 1"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "old farm road") & (
+            eco4_prospects_survey_list["POSTCODE"] == "PL5 1EP"),
+        "old school road",
+        eco4_prospects_survey_list["ADDRESS 1"]
+    )
+
+    eco4_prospects_survey_list["ADDRESS 1"] = np.where(
+        (eco4_prospects_survey_list["ADDRESS 1"] == "croft orchard") & (
+            eco4_prospects_survey_list["POSTCODE"] == "TQ12 6RP") & (
+            eco4_prospects_survey_list["NO"] == 52),
+        "drum way",
+        eco4_prospects_survey_list["ADDRESS 1"]
+    )
+
+    # Literal string corrections
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(
+        "the gulls, collaton road", "the gulls collaton road", regex=False
+    )
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(
+        "crows-an-eglose", "crows-an-eglos", regex=False
+    )
+
+    # Streets that are known not to be in the asset list at all
+    streets_not_in_asset_list = [
+        "kaynton mead", "broadmoor lane", "hoopers barton", "ecos court", "selwood road",
+        "castle street"
+    ]
+
+    # The asset list does not change inside the loop, so lower-case the addresses once
+    asset_address_lower = asset_list["T1_Address"].str.lower()
+
+    # We have a high volume of rows that do not match
+    matched = []
+    nomatch = []
+    for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
+
+        # Not in the asset list
+        if (row["ADDRESS 1"] == "berry park") and row["NO"] in [40, 42] and row["POSTCODE"] == "PL12 6EN":
+            nomatch.append(row.to_dict())
+            continue
+
+        # Not in the asset list
+        if (row["ADDRESS 1"] == "roberts road") and row["NO"] == 23 and row["POSTCODE"] == "PL5 1DP":
+            nomatch.append(row.to_dict())
+            continue
+
+        # Not in the asset list
+        if row["ADDRESS 1"] in streets_not_in_asset_list:
+            nomatch.append(row.to_dict())
+            continue
+
+        house_number = row["NO"]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
+            # "flat 3" style numbers: keep only what follows "flat"
+            if "flat" in house_number:
+                house_number = house_number.split("flat")[1].strip()
+
+        # Filter on the first line of the address
+        df = asset_list[asset_address_lower.str.contains(row["ADDRESS 1"].lower())].copy()
+
+        # Progressively stricter filters, each one only applied while the match is still ambiguous
+        if df.shape[0] != 1 and house_number is not None:
+            df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
+        if df.shape[0] != 1 and house_number is not None:
+            df = df[df["HouseNo"] == str(house_number)]
+        if df.shape[0] != 1 and row["POSTCODE"] is not None:
+            df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
+        if df.shape[0] != 1:
+            nomatch.append(row.to_dict())
+            continue
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["T1_Address"].values[0],
+                "survey_house_no": row["NO"],
+                "survey_street_name": row["ADDRESS 1"],
+                "survey_postcode": row["POSTCODE"],
+            }
+        )
+
+    # Kept as a frame so the unmatched prospects can be inspected when running this interactively
+    nomatch = pd.DataFrame(nomatch)
+    # Naming the columns keeps the merge below working even when nothing matched
+    matched = pd.DataFrame(
+        matched,
+        columns=["survey_key", "matched_address", "survey_house_no", "survey_street_name", "survey_postcode"],
+    )
+
+    matched["warmfront_identified"] = True
+
+    # Combine asset list and surveys
+    data = asset_list.merge(
+        matched, how="left", left_on="T1_Address", right_on="matched_address",
+    )
+    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)
+
+    return data, eco4_prospects_survey_list
+
+
+def _classify_eco4_confidence(sap, post_install_sap):
+    """
+    Grade how likely a property is to reach its required post-install SAP score.
+
+    A property that is currently an F or G (SAP of 38 or below) is measured against a target
+    of 55, every other property against a target of 69.
+    NOTE(review): 55 and 69 look like the band D and band C floors - confirm against the
+    ECO4 guidance.
+
+    :param sap: current SAP score of the property
+    :param post_install_sap: predicted SAP score once the modelled measures are installed
+    :return: "highest confidence" (target + 2 or more), "high confidence" (target or more),
+        "medium confidence" (target - 2 or more), otherwise "unlikely". A missing
+        post_install_sap fails every comparison and therefore also gives "unlikely"
+    """
+    target = 55 if sap <= 38 else 69
+    if post_install_sap >= target + 2:
+        return "highest confidence"
+    if post_install_sap >= target:
+        return "high confidence"
+    if post_install_sap >= target - 2:
+        return "medium confidence"
+    return "unlikely"
+
+
+def get_epc_data(data, cleaned, cleaning_data, created_at):
+    """
+    Search the EPC for every property, run the GBIS / ECO4 eligibility checks and predict the
+    SAP uplift for the properties that pass the ECO4 (warmfront) check.
+
+    :param data: asset list; every row must provide "HouseNo", "postcode", "T1_Address",
+        "warmfront_identified" and "row_id" (row_id is expected to look like "<prefix>_<n>")
+    :param cleaned: cleaned EPC data, handed to Eligibility and prepare_model_data_row
+    :param cleaning_data: dataset handed to DataProcessor.apply_averages_cleaning
+    :param created_at: timestamp handed to prepare_model_data_row and ModelApi
+    :return: tuple of (results_df, scoring_data, nodata) - one result row per property that has
+        an EPC, the raw model input rows, and the asset rows the EPC search found nothing for
+    """
+    scoring_data = []
+    results = []
+    nodata = []
+
+    for _, property_meta in tqdm(data.iterrows(), total=len(data)):
+        searcher = SearchEpc(
+            address1=property_meta["HouseNo"],
+            postcode=property_meta["postcode"],
+            size=1000
+        )
+        searcher.search()
+
+        if searcher.data is None:
+            nodata.append(property_meta)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(address=property_meta["T1_Address"])
+        # We also want to get the penultimate epc; fall back to the newest if there is no older one
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        # Warmfront flagged this property but the newest EPC fails both checks, so we assess the
+        # penultimate EPC instead
+        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (
+            property_meta["warmfront_identified"]
+        ):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis_warmfront()
+            eligibility.check_eco4_warmfront()
+            # If this is the case, we need to update the older epcs so that neither the newest nor
+            # the EPC we are now modelling is treated as history
+            older_epcs = [
+                x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]]
+            ]
+
+        # Full checks - these feed the *_future columns
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            if eligibility.epc["uprn"] == "":
+                # UPRN is cast to int further down, so substitute the numeric part of the row id
+                eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1])
+
+            scoring_dictionary = prepare_model_data_row(
+                property_id=property_meta["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        results.append(
+            {
+                "row_id": property_meta["row_id"],
+                "uprn": eligibility.epc["uprn"],
+                "Address": property_meta["T1_Address"],
+                "Postcode": property_meta["postcode"],
+                "property_type": eligibility.epc["property-type"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "cavity_type": eligibility.cavity["type"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    results_df = pd.DataFrame(results)
+
+    if not scoring_data:
+        # Nothing passed the ECO4 check, so there is nothing to clean or score. Return the
+        # results with the model columns left empty instead of failing on an empty frame.
+        for column in ("sap_uplift", "post_install_sap", "eligibility_classification"):
+            results_df[column] = float("nan")
+        return results_df, scoring_data, nodata
+
+    scoring_df = pd.DataFrame(scoring_data)
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)
+
+    # NOTE(review): the portfolio id says "ha33" although this is not the HA33 app - confirm
+    # whether it should be specific to this housing association
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    # Turn each predicted SAP into an uplift over the current SAP and total it per property
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    # The upgrade requirements are dependent on the current SAP, see _classify_eco4_confidence
+    eco4_rows = results_df[results_df["eco4_eligible"] == True]
+    eligibility_assessment = pd.DataFrame(
+        {
+            "row_id": eco4_rows["row_id"].to_list(),
+            "eligibility_classification": [
+                _classify_eco4_confidence(sap, post_install_sap)
+                for sap, post_install_sap in zip(eco4_rows["sap"], eco4_rows["post_install_sap"])
+            ],
+        }
+    )
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+    return results_df, scoring_data, nodata
+
+
+def analyse_results(results_df, data, eco4_prospects_survey_list):
+    """
+    Summarise how our eligibility results compare with the properties Warmfront identified.
+
+    :param results_df: results as returned by get_epc_data
+    :param data: asset list providing "row_id", "survey_key" and "warmfront_identified"
+    :param eco4_prospects_survey_list: not used at the moment; kept so existing calls still work
+    :return: dict with
+        "success_rate" - share of Warmfront-identified properties we flag as ECO4 or GBIS
+        (NaN when Warmfront identified nothing),
+        "additional_identified_eco" - ECO4-eligible properties Warmfront did not identify,
+        "additional_identified_eco_classification" - value counts of their classification,
+        "additional_identified_gbis" - GBIS-only properties Warmfront did not identify,
+        "additional_identified_eco_future" / "additional_identified_gbis_future" - the same two
+        counts based on the full (future) checks
+    """
+    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
+        results_df, how="left", on="row_id"
+    )
+
+    warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
+    not_warmfront = analysis_data["warmfront_identified"] == False
+
+    # Of the ECO jobs, what proportion do we get right
+    n_warmfront = warmfront_identified.shape[0]
+    n_matched = (warmfront_identified["eco4_eligible"] | warmfront_identified["gbis_eligible"]).sum()
+    success_rate = n_matched / n_warmfront if n_warmfront else float("nan")
+
+    # Additional identified
+    additional_identified_eco = analysis_data[
+        (analysis_data["eco4_eligible"] == True) & not_warmfront
+        ]
+    additional_identified_gbis = analysis_data[
+        (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & not_warmfront
+        ].shape[0]
+
+    # Future
+    additional_identified_eco_future = analysis_data[
+        (analysis_data["eco4_eligible_future"] == True) & not_warmfront
+        ].shape[0]
+    additional_identified_gbis_future = analysis_data[
+        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False)
+        & not_warmfront
+        ].shape[0]
+
+    return {
+        "success_rate": success_rate,
+        "additional_identified_eco": additional_identified_eco.shape[0],
+        "additional_identified_eco_classification":
+            additional_identified_eco["eligibility_classification"].value_counts(),
+        "additional_identified_gbis": additional_identified_gbis,
+        "additional_identified_eco_future": additional_identified_eco_future,
+        "additional_identified_gbis_future": additional_identified_gbis_future,
+    }
+
+
+def app():
+    """
+    Run the eligibility pipeline end to end: load the asset list, fetch the cleaned EPC data
+    and the cleaning dataset from S3, then look up and score every property.
+    """
+    asset_list, eco4_prospects_survey_list = load_data()
+
+    # Give every property an id of the form "ha25_<position>"; get_epc_data falls back to the
+    # numeric part when an EPC has no UPRN
+    n_properties = len(asset_list)
+    asset_list["row_id"] = ["ha25_" + str(position) for position in range(n_properties)]
+
+    packed_epc_data = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(packed_epc_data, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_epc_data(asset_list, cleaned, cleaning_data, created_at)
+    # To keep the outputs of a run, pickle them:
+    # import pickle
+    # with open("ha25.pickle", "wb") as f:
+    #     pickle.dump(
+    #         {
+    #             "results_df": results_df,
+    #             "scoring_data": scoring_data,
+    #             "nodata": nodata
+    #         },
+    #         f
+    #     )
--- a/etl/eligibility/ha_15_32/ha33_app.py
+++ b/etl/eligibility/ha_15_32/ha33_app.py
@ -264,21 +264,21 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at):


 def analyse_ha_33(results_df, data):
-    results_df_social = results_df[results_df["tenure"] == "Rented (social)"]
+    # results_df_social = results_df[results_df["tenure"] == "Rented (social)"]
+    #
+    # results_df_social["tenure"].value_counts()

-    results_df_social["tenure"].value_counts()
+    data[data["row_id"].isin(results_df["row_id"].values)]["PROPERTY TYPE"].value_counts()

-    data[data["row_id"].isin(results_df_social["row_id"].values)]["PROPERTY TYPE"].value_counts()
+    n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum()
+    n_eco4 = results_df["eco4_eligible"].sum()
+    n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum()

-    n_identified = (results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]).sum()
-    n_eco4 = results_df_social["eco4_eligible"].sum()
-    n_gbis = results_df_social[~results_df_social["eco4_eligible"]]["gbis_eligible"].sum()
-
-    eco_eligibile = results_df_social[results_df_social["eco4_eligible"]]
+    eco_eligibile = results_df[results_df["eco4_eligible"]]
    eco_eligibile["walls"].value_counts()
    eco_eligibile["roof"].value_counts()

-    results_df_social[results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]]["tenure"].value_counts()
+    results_df[results_df["gbis_eligible"] | results_df["eco4_eligible"]]["tenure"].value_counts()

    results_df_social["eligibility_classification"].value_counts()

@ -316,3 +316,11 @@ def app():
    created_at = datetime.now().isoformat()

    results_df, _, _ = get_ha_33data(data, cleaned, cleaning_data, created_at)
+
+    # Read in
+    import pickle
+    with open("ha33_results.pickle", "rb") as f:
+        data = pickle.load(f)
+    results_df = pd.DataFrame(data["results"])
+    scoring_data = data["scoring_data"]
+    nodata = data["nodata"]
--- a/etl/eligibility/ha_15_32/ha4_app.py
+++ b/etl/eligibility/ha_15_32/ha4_app.py
@ -241,15 +241,11 @@ def get_ha_4_data(data, cleaned, cleaning_data, created_at):


 def analyse_ha_4(results_df, data):
-    results_df_social = results_df[results_df["tenure"] == "Rented (social)"]
+    n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum()
+    n_eco4 = results_df["eco4_eligible"].sum()
+    n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum()

-    results_df_social["property_type"].value_counts()
-
-    n_identified = (results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]).sum()
-    n_eco4 = results_df_social["eco4_eligible"].sum()
-    n_gbis = results_df_social[~results_df_social["eco4_eligible"]]["gbis_eligible"].sum()
-
-    eco_eligibile = results_df_social[results_df_social["eco4_eligible"]]
+    eco_eligibile = results_df[results_df["eco4_eligible"]]
    eco_eligibile["eligibility_classification"].value_counts()

    future_possibilities_eco = results_df[
@ -299,3 +295,11 @@ def app():
    #             "scoring_data": scoring_data,
    #             "nodata": nodata
    #         }, f)
+
+    # Read in
+    # import pickle
+    # with open("ha_4.pickle", "rb") as f:
+    #     data = pickle.load(f)
+    # results_df = data["results_df"]
+    # scoring_data = data["scoring_data"]
+    # nodata = data["nodata"]
--- a/etl/eligibility/ha_15_32/ha7_app.py
+++ b/etl/eligibility/ha_15_32/ha7_app.py
@ -0,0 +1,287 @@
+import msgpack
+import openpyxl
+from openpyxl.styles.colors import COLOR_INDEX
+from pathlib import Path
+from datetime import datetime
+import pandas as pd
+import numpy as np
+from utils.s3 import read_from_s3
+from utils.logger import setup_logger
+from dotenv import load_dotenv
+from backend.app.utils import read_parquet_from_s3
+from tqdm import tqdm
+from backend.SearchEpc import SearchEpc
+from etl.eligibility.Eligibility import Eligibility
+from etl.eligibility.ha_15_32.app import prepare_model_data_row
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
+from backend.ml_models.api import ModelApi
+
+import re
+
+# This module already lives in etl/eligibility/ha_15_32, so the .env file is a sibling of it.
+# Repeating the package path after Path(__file__).parent points at a directory that does not
+# exist, and load_dotenv then loads nothing without complaining.
+ENV_FILE = Path(__file__).parent / ".env"
+
+logger = setup_logger()
+load_dotenv(ENV_FILE)
+
+
+def _resolve_fill_colour(cell):
+    """
+    Return the fill colour of a cell as a colour string, or None if the cell has no usable fill.
+    """
+    colour_index = cell.fill.start_color.index
+    if colour_index == '00000000':
+        # No fill on this cell
+        return None
+    if isinstance(colour_index, int):
+        # Integer indices are looked up in openpyxl's palette; anything outside it is unknown
+        return COLOR_INDEX[colour_index] if 0 <= colour_index < len(COLOR_INDEX) else None
+    # Already a colour string, there is nothing to look up
+    return colour_index
+
+
+def load_data():
+    """
+    Load the data from the excel
+
+    Reads the active sheet of the HA 7 asset list and records the fill colour of the first
+    cell of every row, because the colour carries the classification of the property.
+
+    :return: DataFrame with the sheet contents (columns that are entirely empty removed) plus
+        "row_color" (colour string or None), "row_colour_name" ("red", "green" or "yellow") and
+        "row_code" ("invalid", "potential ECO4" or "needs criteria change")
+    """
+
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 7 ASSET LIST.xlsx')
+    sheet = workbook.active
+
+    # Prepare lists to collect rows data and their colors
+    rows_data = []
+    rows_colors = []
+    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        rows_data.append([cell.value for cell in row])
+        rows_colors.append(_resolve_fill_colour(row[0]))
+
+    df = pd.DataFrame(rows_data, columns=[cell.value for cell in sheet[1]])
+
+    # Add the row colors as a new column
+    df['row_color'] = rows_colors
+
+    # Name the column at position 8 by assigning a fresh list of labels, rather than writing
+    # into the array behind the existing Index
+    column_names = list(df.columns)
+    column_names[8] = "is_active"
+    df.columns = column_names
+
+    # Remove None columns
+    df = df.dropna(axis=1, how='all')
+
+    # We now parse the colours. Anything that is neither of the two known codes - including rows
+    # without a fill - falls through to "yellow".
+    # NOTE(review): as aRGB values "0000FFFF" and "00FF00FF" would read as cyan and magenta;
+    # presumably they show as red and green in this workbook - confirm against the spreadsheet
+    df["row_colour_name"] = np.where(
+        df["row_color"] == "0000FFFF", "red",
+        np.where(df["row_color"] == "00FF00FF", "green", "yellow")
+    )
+    df["row_code"] = np.where(
+        df["row_colour_name"] == "red", "invalid",
+        np.where(df["row_colour_name"] == "green", "potential ECO4", "needs criteria change")
+    )
+
+    return df
+
+
+def _classify_eco4_confidence(sap, post_install_sap):
+    """
+    Grade how likely a property is to reach its required post-install SAP score.
+
+    A property that is currently an F or G (SAP of 38 or below) is measured against a target
+    of 55, every other property against a target of 69.
+    NOTE(review): 55 and 69 look like the band D and band C floors - confirm against the
+    ECO4 guidance.
+
+    :param sap: current SAP score of the property
+    :param post_install_sap: predicted SAP score once the modelled measures are installed
+    :return: "highest confidence" (target + 2 or more), "high confidence" (target or more),
+        "medium confidence" (target - 2 or more), otherwise "unlikely". A missing
+        post_install_sap fails every comparison and therefore also gives "unlikely"
+    """
+    target = 55 if sap <= 38 else 69
+    if post_install_sap >= target + 2:
+        return "highest confidence"
+    if post_install_sap >= target:
+        return "high confidence"
+    if post_install_sap >= target - 2:
+        return "medium confidence"
+    return "unlikely"
+
+
+def get_ha7_data(data, cleaned, cleaning_data, created_at):
+    """
+    Search the EPC for every HA 7 property, run the GBIS / ECO4 eligibility checks and predict
+    the SAP uplift for the properties that pass the ECO4 (warmfront) check.
+
+    :param data: asset list; every row must provide "Address", "Postcode", "Property Type"
+        and "row_id"
+    :param cleaned: cleaned EPC data, handed to Eligibility and prepare_model_data_row
+    :param cleaning_data: dataset handed to DataProcessor.apply_averages_cleaning
+    :param created_at: timestamp handed to prepare_model_data_row and ModelApi
+    :return: tuple of (results_df, scoring_data, nodata) - one result row per property that has
+        an EPC, the raw model input rows, and the asset rows the EPC search found nothing for
+    """
+    # Asset list spelling -> spelling used when filtering the EPC search results
+    property_type_lookup = {
+        "Mid Terrace": "Mid-Terrace",
+        "End Terrace": "End-Terrace",
+        "Semi Detached": "Semi-Detached",
+        "Detached": "Detached",
+    }
+
+    scoring_data = []
+    results = []
+    nodata = []
+    for _, house in tqdm(data.iterrows(), total=len(data)):
+
+        searcher = SearchEpc(
+            address1=house["Address"],
+            postcode=house["Postcode"]
+        )
+
+        response = searcher.search()
+        if response["status"] == 204:
+            # The search came back without content for this address
+            nodata.append(house)
+            continue
+
+        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(
+            property_type=property_type_lookup.get(house["Property Type"], None),
+            address=house["Address"],
+        )
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis_warmfront()
+        eligibility.check_eco4_warmfront()
+
+        # The full gbis and eco4 checks are run for every house; they feed the *_future columns
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        if eligibility.eco4_warmfront["eligible"]:
+            scoring_dictionary = prepare_model_data_row(
+                property_id=house["row_id"],
+                modelling_epc=eligibility.epc,
+                cleaned=cleaned,
+                cleaning_data=cleaning_data,
+                created_at=created_at,
+                old_data=older_epcs,
+                full_sap_epc=full_sap_epc
+            )
+            scoring_data.extend(scoring_dictionary)
+
+        # Every house with an EPC gets a record, whatever the outcome of the checks
+        results.append(
+            {
+                "row_id": house["row_id"],
+                "address": house["Address"],
+                "postcode": house["Postcode"],
+                "gbis_eligible": eligibility.gbis_warmfront,
+                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
+                "eco4_message": eligibility.eco4_warmfront["message"],
+                "sap": float(eligibility.epc["current-energy-efficiency"]),
+                "gbis_eligible_future": eligibility.gbis["eligible"],
+                "gbis_eligible_future_message": eligibility.gbis["message"],
+                "eco4_eligible_future": eligibility.eco4["eligible"],
+                "eco4_eligible_future_message": eligibility.eco4["message"],
+                # Property components
+                "roof": eligibility.roof["clean_description"],
+                "walls": eligibility.walls["clean_description"],
+                "heating": eligibility.epc["mainheat-description"],
+                "tenure": eligibility.tenure,
+                "date_epc": eligibility.epc["lodgement-date"],
+            }
+        )
+
+    results_df = pd.DataFrame(results)
+
+    if not scoring_data:
+        # Nothing passed the ECO4 check, so there is nothing to clean or score. Return the
+        # results with the model columns left empty instead of failing on an empty frame.
+        for column in ("sap_uplift", "post_install_sap", "eligibility_classification"):
+            results_df[column] = float("nan")
+        return results_df, scoring_data, nodata
+
+    scoring_df = pd.DataFrame(scoring_data)
+    # Implement the same process that is being used in the recommendation engine to clean scoring_df
+
+    # Perform the same cleaning as in the model - first clean number of room variables though
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
+        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+    )
+
+    scoring_df = DataProcessor.apply_averages_cleaning(
+        data_to_clean=scoring_df,
+        cleaning_data=cleaning_data,
+        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
+    ).drop(columns=["LOCAL_AUTHORITY"])
+
+    scoring_df = DataProcessor.clean_missings_after_description_process(
+        scoring_df,
+        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
+            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
+    )
+
+    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
+
+    # NOTE(review): the portfolio id says "ha33" although this is the HA 7 app - confirm
+    # whether it should be specific to this housing association
+    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
+    all_predictions = model_api.predict_all(
+        df=scoring_df,
+        bucket="retrofit-data-dev",
+        prediction_buckets={
+            "sap_change_predictions": "retrofit-sap-predictions-dev",
+            "heat_demand_predictions": "retrofit-heat-predictions-dev",
+            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
+        }
+    )
+
+    predictions = all_predictions["sap_change_predictions"].copy()
+
+    # Turn each predicted SAP into an uplift over the current SAP and total it per property
+    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
+        results_df[["row_id", "sap"]], how="left", on="row_id"
+    )
+    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
+    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
+
+    results_df = results_df.merge(
+        predictions[["sap_uplift", "row_id"]],
+        how="left",
+        on="row_id"
+    )
+
+    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]
+
+    # The upgrade requirements are dependent on the current SAP, see _classify_eco4_confidence
+    eco4_rows = results_df[results_df["eco4_eligible"] == True]
+    eligibility_assessment = pd.DataFrame(
+        {
+            "row_id": eco4_rows["row_id"].to_list(),
+            "eligibility_classification": [
+                _classify_eco4_confidence(sap, post_install_sap)
+                for sap, post_install_sap in zip(eco4_rows["sap"], eco4_rows["post_install_sap"])
+            ],
+        }
+    )
+
+    results_df = results_df.merge(
+        eligibility_assessment, how="left", on="row_id"
+    )
+
+    return results_df, scoring_data, nodata
+
+
+def analyse_ha_7(results_df, data):
+    """
+    Summarise the HA 7 eligibility results.
+
+    :param results_df: results as returned by get_ha7_data; "gbis_eligible" and "eco4_eligible"
+        must be boolean columns
+    :param data: asset list providing "row_id", "row_code" and "Property Type"
+    :return: dict with
+        "warmfront_identification" - value counts of the spreadsheet row codes,
+        "n_warmfront_identified" - rows coded "potential ECO4",
+        "property_types" - value counts of the property types,
+        "n_identified" - properties we flag as GBIS or ECO4,
+        "n_eco4" - properties we flag as ECO4,
+        "n_gbis" - properties we flag as GBIS but not ECO4,
+        "eco4_classification" - value counts of the classification of the ECO4 properties,
+        "n_future_eco" / "n_future_gbis" - properties not flagged today that pass the full
+        ECO4 check / only the full GBIS check,
+        "total_future_possibilities" - sum of the two future counts
+    """
+    df = results_df.merge(
+        data[["row_id", "row_code", "Property Type"]], how="left", on="row_id"
+    )
+    warmfront_identification = df["row_code"].value_counts()
+    warmfront_identified = df[df["row_code"] == "potential ECO4"]
+
+    property_types = df["Property Type"].value_counts()
+
+    identified_now = results_df["gbis_eligible"] | results_df["eco4_eligible"]
+    n_identified = identified_now.sum()
+
+    eco_identified = results_df[results_df["eco4_eligible"]]
+    n_eco4 = eco_identified["eco4_eligible"].sum()
+    n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum()
+
+    # Only properties that are not flagged today count as future possibilities
+    future_possibilities_eco = results_df[
+        (results_df["eco4_eligible_future"] == True) & (~identified_now)
+        ]
+
+    future_possibilities_gbis = results_df[
+        (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & (
+            ~identified_now)
+        ]
+
+    n_future_eco = future_possibilities_eco.shape[0]
+    n_future_gbis = future_possibilities_gbis.shape[0]
+
+    return {
+        "warmfront_identification": warmfront_identification,
+        "n_warmfront_identified": warmfront_identified.shape[0],
+        "property_types": property_types,
+        "n_identified": n_identified,
+        "n_eco4": n_eco4,
+        "n_gbis": n_gbis,
+        "eco4_classification": eco_identified["eligibility_classification"].value_counts(),
+        "n_future_eco": n_future_eco,
+        "n_future_gbis": n_future_gbis,
+        "total_future_possibilities": n_future_eco + n_future_gbis,
+    }
+
+
+def app():
+    """
+    Run the HA 7 eligibility pipeline end to end: load the colour-coded asset list, fetch the
+    cleaned EPC data and the cleaning dataset from S3, then look up and score every property.
+    """
+    asset_list = load_data()
+
+    # Give every property an id made of "ha7" followed by its position in the asset list
+    n_properties = len(asset_list)
+    asset_list["row_id"] = ["ha7" + str(position) for position in range(n_properties)]
+
+    packed_epc_data = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(packed_epc_data, raw=False)
+
+    cleaning_data = read_parquet_from_s3(
+        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
+    )
+
+    created_at = datetime.now().isoformat()
+
+    results_df, scoring_data, nodata = get_ha7_data(asset_list, cleaned, cleaning_data, created_at)
+
+    # To keep the outputs of a run, pickle the results:
+    # import pickle
+    # with open("ha7_results.pkl", "wb") as f:
+    #     pickle.dump({"results_df": results_df, "scoring_data": scoring_data, "nodata": nodata}, f)