# Model/etl/eligibility/ha_15_32/ha_analysis_batch_3.py

import os
import re
import openpyxl
from fuzzywuzzy import fuzz
from pathlib import Path
import msgpack
from datetime import datetime
import pandas as pd
import numpy as np
from utils.s3 import (
    read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3, save_dataframe_to_s3_parquet
)
from utils.logger import setup_logger
from dotenv import load_dotenv
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
from etl.eligibility.ha_15_32.app import prepare_model_data_row
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from etl.epc.Record import EPCRecord
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc.DataProcessor import EPCDataProcessor
from datetime import datetime

import inspect

# Name of the file this code was defined in, obtained by asking inspect for
# the source file of a throwaway lambda created right here.
# NOTE(review): presumably used instead of __file__ so the script also works
# when pasted into an interactive console - confirm.
src_file_path = inspect.getfile(lambda: None)

# Both locations are resolved relative to the directory containing
# src_file_path.
# NOTE(review): ENV_FILE appends "etl/eligibility/ha_15_32" to that directory,
# which only points at a real .env when src_file_path sits at the project
# root - confirm the intended launch location.
ENV_FILE = Path(src_file_path).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
DATA_FOLDER = Path(src_file_path).parent / "local_data" / "ha_data"

logger = setup_logger()

# Load the .env file BEFORE reading settings from the environment. Reading
# EPC_AUTH_TOKEN first (as the code previously did) yields None whenever the
# token is defined only in the .env file. Variables already present in the
# process environment are not overridden by load_dotenv's default settings,
# so an externally supplied token still takes precedence.
load_dotenv(ENV_FILE)
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

# Per-housing-association lookup tables that translate raw property
# descriptions into normalised values.
#
# The top-level key is the housing association identifier ("HA1", "HA2", ...).
# The value stored under each identifier takes one of three shapes:
#
#   * flat map      raw description -> property type ('House', 'Flat',
#                   'Bungalow', 'Maisonette') or None           (e.g. HA2, HA5)
#   * nested maps   separate sub-maps under "property_type" and/or
#                   "built_form"; built forms normalise to 'Mid-Terrace',
#                   'End-Terrace', 'Semi-Detached' or 'Detached'
#                                              (HA1, HA6, HA7, HA14, HA107)
#   * combined map  raw description -> dict carrying both the property type
#                   and the built form                          (HA16, HA39)
#
# A value of None marks a description that does not correspond to any of the
# supported property types (bedsits, hostels, offices, garages, blanks, ...).
#
# NOTE(review): the raw keys are presumably copied verbatim from each
# association's asset list - several contain typos or trailing spaces
# (e.g. "Commerical", 'House. MT ', 'End Terrace Housex'). Do not "tidy" them
# without checking the source data, or lookups will stop matching.
# NOTE(review): HA16 uses hyphenated inner keys ("property-type",
# "built-form") while HA39 uses underscores ("property_type", "built_form");
# confirm that the consuming code handles both spellings.
PROPERTY_TYPE_LOOKUP = {
    "HA1": {
        "built_form": {
            'Mid Terrace': 'Mid-Terrace',
            'Semi-Detached': 'Semi-Detached',
            'End Terrace': 'End-Terrace',
            'Detached': 'Detached',
            'Enclosed Mid': 'Mid-Terrace',
            'Detached Local Connect': 'Detached',
        }
    },
    "HA2": {
        'HOUSE': 'House',
        'FLAT': 'Flat',
        'SHELTERED': None,
        'BUNGALOW': 'Bungalow',
        'BED-SIT': None,
        'MAISONETTE': "Maisonette",
        'HOSTEL': None
    },
    "HA5": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Bedsit": None
    },
    "HA6": {
        "property_type": {
            'HOUSE': "House",
            'GROUND FLOOR FLAT': "Flat",
            'UPPER FLOOR FLAT': "Flat",
            'MAISONETTE': "Maisonette",
            'BUNGALOW': "Bungalow",
            'WARDEN BUNGALOW': "Bungalow",
            'WARDEN FLAT': "Flat",
            'EXTRACARE SCHEME': "Flat",
        }
    },
    "HA7": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "End Terraced": "End-Terrace",
        }
    },
    "HA8": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Room": None,
        "Other": None,
        "Commerical": None
    },
    "HA11": {
        "Flat": "Flat",
        "House": "House",
        "Semi-Det House": "House",
        "Bedsit": None,
        "End-Terr House": "House",
        "Mid-Terr House": "House",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "End Terr Flat": "Flat",
        "Mid Terr Flat": "Flat",
        "Detached Flat": "Flat",
    },
    "HA12": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
    },
    "HA13": {
        'House': "House",
        'Flat': "Flat",
        'House MT': "House",
        'House SD': "House",
        'House ET': "House",
        'Bungalow MT': "Bungalow",
        'Bungalow ET': "Bungalow",
        'ii': None,
    },
    "HA14": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        }
    },
    "HA15": {
        'House': 'House',
        'Flat': 'Flat',
        'Bungalow': 'Bungalow',
        'Maisonette': 'Maisonette',
        'Flat over garage': 'Flat',
    },
    # Combined map; note the hyphenated inner keys (HA39 uses underscores).
    "HA16": {
        'Semi Detached Bungalow': {"property-type": "Bungalow", "built-form": "Semi-Detached"},
        'Mid Terraced House': {"property-type": "House", "built-form": "Mid-Terrace"},
        'End Terraced House': {"property-type": "House", "built-form": "End-Terrace"},
        'Low Rise Flat': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Semi-Detached House': {"property-type": "House", "built-form": "Semi-Detached"},
        'Detached Bungalow': {"property-type": "Bungalow", "built-form": "Detached"},
        'End Terraced Bungalow': {"property-type": "Bungalow", "built-form": "End-Terrace"},
        'Mid Terraced Bungalow': {"property-type": "Bungalow", "built-form": "Mid-Terrace"},
        'Medium Rise Flat': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Detached House': {"property-type": "House", "built-form": "Detached"},
        'Cottage Flat': {"property-type": "Flat", "built-form": "Semi-Detached"},
        'Maisonette Medium Rise': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Maisonette Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'End Terraced Town House': {"property-type": "House", "built-form": "End-Terrace"},
        'Flat Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Mid Terraced Town House': {"property-type": "House", "built-form": "Mid-Terrace"},
    },
    "HA18": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Shop": None,
        "Hostel": None,
        "Block": None,
    },
    "HA20": {
        "House": "House",
        "Flat": "Flat",
        'Sheltered Flat': "Flat",
        'Maisonette': 'Maisonette',
        'Bungalow': 'Bungalow',
        'House. SD': 'House',
        'House. MT': 'House',
        'House. ET': 'House',
        'Sheltered Bungalow': 'Bungalow',
        'Guest Accomodation': None,
        'Sheltered House': 'House',
        'House. MT ': 'House',
        'House. D': 'House'
    },
    "HA24": {
        '01 HOUSE': 'House',
        '02 FLAT': 'Flat',
        '03 BUNGALOW': 'Bungalow',
        '10 PBUNGALOW': 'Bungalow',
        '01 HOUSE MID': 'House',
        '13 SBUNGALOW': 'Bungalow',
        '12 SBEDSIT': None,  # BEDSIT does not match the specified property types
        '14 SFLAT': 'Flat',
        '05 BEDSIT': None,
        '04 MAISONETTE': 'Maisonette',
        '11 PFLAT': 'Flat',
        '09 PBEDSIT': None
    },
    "HA25": {
        'Flat': 'Flat',
        'Mid Terrace House': 'House',
        'Semi Detached House': 'House',
        'End Terrace House': 'House',
        'House': 'House',
        'Semi Detached Bung': 'Bungalow',
        'Bungalow': 'Bungalow',
        'End Terrace Bungalow': 'Bungalow',
        'Maisonnette': 'Maisonette',
        'Mid Terrace Bungalow': 'Bungalow',
        'Bedspace': None,
        'Detached House': 'House',
        'Bedsit': 'Flat',
        'Coach House': 'House',
        'Detached Bungalow': 'Bungalow',
        'Office Buildings': None,
        'Guest Room': None,
        'Mid Terrace Housekeeping ': 'House',
        'End Terrace Housex': 'House'
    },
    "HA28": {
        'Flat': 'Flat',
        'Semi detached house': 'House',
        'Terraced house': 'House',
        'Maisonette flat': 'Maisonette',
        'Sheltered bedsit': None,
        'APD flat': 'Flat',
        'Bungalow terraced': 'Bungalow',
        'Flat with partition': 'Flat',
        'Bungalow semi detached': 'Bungalow',
        'APD Bungalow': 'Bungalow',
        'Sheltered flat': 'Flat',
        'Bedsit Flat': 'Flat',
        'Bedsit bungalow semi detached': 'Bungalow',
        'Sheltered bungalow terraced': 'Bungalow',
        'Sheltered bedsit disabled': None,
        'Bedsit bungalow terraced': 'Bungalow',
        'Sheltered bungalow semi detached': 'Bungalow',
        'Sheltered warden flat': 'Flat',
        'Bungalow detached': 'Bungalow',
        'Block': None,  # Does not match the specified property types
        'End Terraced House': 'House',
        'Mid Terraced House': 'House',
        '#N/A': None,  # Assuming this is an invalid or missing entry
        0: None  # Assuming 0 is also an invalid or missing entry
    },
    "HA30": {
        'House': 'House',
        'Flat': 'Flat',
        'Bungalow': 'Bungalow',
        'House with Attached Garage': 'House',
        'Bed Space': None,  # Assuming this does not fit the specified property types
        'House with Garage': 'House',
        'Bungalow with Wheelchair Access': 'Bungalow',
        'Maisonette': 'Maisonette',
        'Flat with Wheelchair Access': 'Flat',
        'Bedsit': None,  # Assuming this does not fit the specified property types
        'Flat w Wheelchair Access & Car Park': 'Flat',
        'House with Wheelchair Access': 'House',
        'Bungalow w Wheelchair Access & Car ': 'Bungalow'
    },
    "HA32": {
        'Bungalow': 'Bungalow',
        'Flat': 'Flat',
        'Bungalow Disabled': 'Bungalow',  # "Disabled" properties categorized with their base type
        'House': 'House',
        'Dormer Bungalow': 'Bungalow',
        'Pop-In': None,  # Does not fit the specified property types
        'Flat Disabled': 'Flat',
        'Laundry': None,  # Does not fit the specified property types
        'Bedsit': None,  # Excluded from the given categories
        'Shed': None,  # Does not fit the specified property types
        'Store Room': None  # Does not fit the specified property types
    },
    "HA34": {
        'Flat': 'Flat',
        'House': 'House',
        'Bungalow': 'Bungalow',
        'Maisonette': 'Maisonette',
        'ND': None,
    },
    "HA35": {
        "Flat": "Flat",
        "Maisonette": "Maisonette",
        "House": "House",
        "Bedsit": None,
        "2 Bedroom Unknown": None,
        "1 Bedroom Unknown": None,
        "3 Bedroom Unknown": None,
        "4 Bedroom Unknown": None,
    },
    "HA37": {
        "FLT": "Flat",
        "HSE": "House",
        "BNW": "Bungalow",
        "MAS": "Maisonette",
        "HSL": None
    },
    # Combined map; built_form is None where the description gives no built
    # form (flats, maisonettes, dormer bungalows).
    "HA39": {
        "Semi house": {"property_type": "House", "built_form": "Semi-Detached"},
        "1st floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"},
        "Ground floor flat": {"property_type": "Flat", "built_form": None},
        "End terrace house": {"property_type": "House", "built_form": "End-Terrace"},
        "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"},
        "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"},
        "2nd floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"},
        "3rd floor flat": {"property_type": "Flat", "built_form": None},
        "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"},
        "Maisonette": {"property_type": "Maisonette", "built_form": None},
        "Detached house": {"property_type": "House", "built_form": "Detached"},
        "Lower ground floor flat": {"property_type": "Flat", "built_form": None},
        "Dormer bungalow": {"property_type": "Bungalow", "built_form": None},
        "Basement flat": {"property_type": "Flat", "built_form": None},
        "Cluster House": {"property_type": "House", "built_form": "Detached"},
        "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None},
        "Ground floor flat with study": {"property_type": "Flat", "built_form": None},
        "4th floor flat": {"property_type": "Flat", "built_form": None},
        "1st floor flat with study room": {"property_type": "Flat", "built_form": None},
        "2nd floor flat with study": {"property_type": "Flat", "built_form": None},
    },
    # Raw descriptions here bundle an age band with the type; only the type
    # is kept, and every "Flats & Maisonettes ..." entry collapses to 'Flat'.
    "HA41": {
        'Garage': None,
        'House 1919-1945': 'House',
        'House 1946-1964': 'House',
        'Flats & Maisonettes post 1974': 'Flat',
        'Non traditional houses': 'House',
        'Sheltered': None,
        'Flats & Maisonettes 1965-1974': 'Flat',
        'House post 1974': 'House',
        'Block': None,
        'Flats & Maisonettes 1946-1964': 'Flat',
        'House 1965-1974': 'House',
        'Non traditional flats': 'Flat',
        'Bungalow 1965-1974': 'Bungalow',
        'PIMSS EMPTY': None,
        'Bungalow post 1974': 'Bungalow',
        'Bungalow 1946-1964': 'Bungalow',
        'Flats & Maisonettes 1919-1945': 'Flat',
        'House pre 1919': 'House',
        'Flats & Maisonettes pre 1919': 'Flat',
        'Bungalow 1919-1945': 'Bungalow',
        'Office': None
    },
    "HA42": {
        'Flat': 'Flat',
        'House': 'House',
        'Flat Basement': 'Flat',
        'Room': None,
        'Bedsit Flat': 'Flat',
        'Maisonette': 'Maisonette',
        'Scheme Office': None,
        'Scheme Lounge': None,
        'Bungalow': 'Bungalow',
        'Garage': None,
        'Scheme Sleep Room': None,
        'Cluster': None,
        'Scheme Room': None
    },
    "HA45": {
        'Large block of flats': 'Flat',
        'Small block of flats/dwelling converted in to flats': 'Flat',
        'Semi-detached house': 'House',
        'Mid-terraced house': 'House',
        'End-terraced house': 'House',
        'Block of flats': 'Flat',
        'Detached house': 'House',
        'Flat in mixed use building': 'Flat',
    },
    "HA48": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Unit": None
    },
    "HA50": {
        'House': 'House',
        'Bungalow': 'Bungalow',
        'Flat': 'Flat',
        'House SD': 'House',
        'House MT': 'House',
        'House ET': 'House',
        'Bungalow ET': 'Bungalow',
        'House SD ': 'House',
        'House. SD': 'House',
        'Bungalow SD': 'Bungalow',
        'Bungalow MT': 'Bungalow',
        'Bungalow D': 'Bungalow',
        'House D': 'House',
        'House. MT': 'House',
        'House ': 'House',
        'House ET ': 'House',
        ' ': None,
        'Flat?': 'Flat',
        'Bungalow ': 'Bungalow'
    },
    "HA51": {
        'FLAT': 'Flat',
        'HOUSE': 'House',
        'MAISONETTE': 'Maisonette',
        'BEDSIT': None,  # Considering as a non-specific residential category here
        'BUNGALOW': 'Bungalow',
    },
    "HA52": {
        'House - Mid Terrace': 'House',
        'Flat - First Floor': 'Flat',
        'Flat - Ground Floor': 'Flat',
        'House - Semi-Detached': 'House',
        'House - End Terrace': 'House',
        'Flat - Second Floor': 'Flat',
        'Bedsit': None,  # Considering as a non-specific residential category here
        'Bungalow - Semi-Detached': 'Bungalow',
        'Bungalow - Mid Terrace': 'Bungalow',
        'Bungalow - End Terrace': 'Bungalow',
        'House - Detached': 'House',
        'Flat - Third Floor': 'Flat',
        'House attached to flats': 'House',
        'Flat - Fourth Floor': 'Flat',
        'Bungalow - Detached': 'Bungalow'
    },
    "HA56": {
        'House Non Specific': 'House',
        'HOUSE TERRACED': 'House',
        'HOUSE - SEMI DETACHD': 'House',
        'Bungalow': 'Bungalow',
        'House - End Terraced': 'House',
        'Block': None,
        'Block with Communal': None,
        'Bungalow - Terraced': 'Bungalow',
        'Bungalow - Semi Dtch': 'Bungalow',
        'Block House with rooms': None,
        'Bungalow - End Terr': 'Bungalow',
        'House - Mid Terraced': 'House',
        'Bungalow - Detached': 'Bungalow',
        'House - Detached': 'House',
        'HOUSE THREE STOREY': 'House',
        'Maisonette': 'Maisonette',
        'Communal Block': None,
        'Scheme': None
    },
    "HA63": {
        'Flat': 'Flat',
        'House - Semi detached': 'House',
        'House - Detached': 'House',
        'House - End Terrace': 'House',
        'House - Mid Terrace': 'House',
        'Bungalow - Semi detached': 'Bungalow',
        'Bungalow': 'Bungalow',
        'Bedsit': None,  # Considering as a non-specific residential category here
        'Maisonette': 'Maisonette',
        'Bungalow - End Terrace': 'Bungalow',
        'Bungalow - Detached': 'Bungalow',
        'Maisonette - Mid Terrace': 'Maisonette',
        'Maisonette - End Terrace': 'Maisonette',
        'Studio Flat': 'Flat',
        'Maisonette - Detached': 'Maisonette',
        'Bungalow - Mid Terrace': 'Bungalow',
        'Bedsit - Mid Terrace': None,
        'Bedsit - End Terrace': None,
        'Amenity Block - Semi detached': None,  # Assuming non-residential
        'Maisonette - Semi Detached': 'Maisonette',
        'Amenity Block - Detached': None,  # Assuming non-residential
        'Hostel': None,  # Typically not considered a standard residential property for this context
        'Bungalow - Attached': 'Bungalow',
        'Unknown': None,  # Not enough information to categorize
        'Studio Flat - Mid Terrace': 'Flat',
        'Chalet - Wheelchair': None  # Specialized type, not categorized here
    },
    "HA107": {
        "property_type": {
            "HOUSE": "House",
            "BUNGALOW": "Bungalow",
            "GRD FLOOR FLAT": "Flat",
            "FIRST FLOOR FLAT": "Flat",
            "SHELTERED BUNGALOW": "Bungalow",
            "MAISONETTE": "Maisonette",
            "SECOND FLOOR FLAT": "Flat",
            "SHELTERED FIRST FLR": "Flat",
            "SHELTERED GROUND FLR": "Flat",
            # NOTE(review): every other bedsit entry in this table maps to
            # None or 'Flat'; confirm that "House" is intended here.
            "GRD FLOOR BED SIT": "House"
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "Detatched": "Detached",
        }
    },
    "HA117": {
        "Flat": "Flat",
        "House": "House",
        "Bungalow": "Bungalow",
        "Flat over garage/underpass": "Flat",
    },
    "HAXXX": {
        'mid terraced house': 'House',
        'semi detached house': 'House',
        '1st fl 4 in a block': 'Flat',
        'G/F 4 in a block': 'Flat',
        'end terraced house': 'House',
        '1st floor flat': 'Flat',
        'G/F floor flat': 'Flat',
        'semi detached bungalow': 'Bungalow',
        '2nd floor flat': 'Flat',
        'mid terrace bungalow': 'Bungalow',
        'detached bungalow': 'Bungalow',
        'end terrace bungalow': 'Bungalow',
        'Staff accommodation': None  # Marked as None due to its special nature
    }
}


class DataLoader:
    # Per-housing-association names of the asset-list columns that hold the
    # address and the postcode. create_asset_list_matching_address reads
    # these to build the lower-cased, stripped "matching_address" and
    # "matching_postcode" columns for associations whose address lives in a
    # single column.
    #
    # NOTE(review): several entries name "matching_postcode" itself as the
    # postcode column. For those associations the column must already exist
    # on the asset list before it is read (HA25 is the exception: the visible
    # code derives its postcode from the address instead) - presumably it is
    # created upstream; confirm.
    COLUMN_CONFIG = {
        "HA1": {
            "address": "Address",
            "postcode": "Address - Postcode"
        },
        "HA5": {
            "address": "Address",
            "postcode": "matching_postcode"
        },
        "HA6": {
            "address": "propertyaddress",
            "postcode": "address"  # The 'address' column actually contains postcode
        },
        "HA12": {
            "address": "Full Address",
            "postcode": "Postcode"
        },
        "HA16": {
            "address": "Address",
            "postcode": "Postcode"
        },
        "HA24": {
            "address": "Address",
            "postcode": "Postcode"
        },
        "HA25": {
            "address": "T1_Address",
            "postcode": "matching_postcode"
        },
        "HA30": {
            "address": "A_Address",
            "postcode": "A_Postcode"
        },
        "HA31": {
            "address": "A_Address",
            "postcode": "matching_postcode"
        },
        "HA45": {
            "address": "Full postal address",
            "postcode": "Postcode"
        },
        "HA48": {
            "address": "Full Address",
            "postcode": "Postcode"
        },
        "HA49": {
            "address": "Property Address Full",
            "postcode": "Property Postcode"
        },
        "HA52": {
            "address": "Postal Address",
            "postcode": "POSTCODE"
        },
        "HA54": {
            "address": "Postal Address",
            "postcode": "matching_postcode"
        }
    }

    # Hard-coded integer figure per housing association.
    # NOTE(review): presumably the number of CIGA records that could not be
    # matched to that association's asset list - the code that consumes this
    # table is outside this view, so confirm before relying on that reading.
    UNMATCHED_CIGA = {
        "HA2": 0,
        "HA6": 117,
        "HA9": 0,
        "HA12": 6,
        "HA13": 119,
        "HA14": 3,
        "HA15": 3,
        "HA16": 7,
        "HA24": 12,
        "HA50": 4,
        "HA63": 15,
        "HA107": 51,
        "HA48": 0,
        "HA45": 0,
        "HA52": 5,
        "HA20": 6
    }

    # Hard-coded integer figure per housing association.
    # NOTE(review): presumably the number of ECO3 records that could not be
    # matched to that association's asset list - the code that consumes this
    # table is outside this view, so confirm before relying on that reading.
    UNMATCHED_ECO3 = {
        "HA25": 154,
        "HA41": 26,
        "HA50": 5,
        "HA56": 320,
        "HA63": 0,
        "HA117": 4,
        "HA51": 24
    }

    def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
        self.directories = directories
        self.use_cache = use_cache
        self.december_figures_filepath = december_figures_filepath
        self.rebuild = rebuild

        self.data = {}
        self.december_figures = None
        self.facts_and_figures = None

    def create_asset_list_matching_address(self, ha_name, asset_list):

        if ha_name in [
            "HA1", "HA5", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA52", "HA54"
        ]:
            asset_list["matching_address"] = asset_list[
                self.COLUMN_CONFIG[ha_name]["address"]
            ].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list[
                self.COLUMN_CONFIG[ha_name]["postcode"]
            ].astype(str).str.lower().str.strip()
        elif ha_name == "HA2":
            # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
            asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA7":
            # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
            asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA8":
            asset_list["matching_address"] = asset_list["AddressLine1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["AddressLine2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA9":
            asset_list["matching_address"] = asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA11":
            asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Post Code"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA13":
            asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["address 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA14":
            # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
            asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 4"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA15":
            asset_list["matching_address"] = (
                asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA18":
            asset_list["matching_address"] = (
                asset_list["Address"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Post Code"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA19":
            asset_list["matching_address"] = (
                asset_list["Address1"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address2"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address3"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA20":
            asset_list["matching_address"] = (
                asset_list["House Name"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Block"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA21":
            asset_list["matching_address"] = (
                asset_list["Address"].astype(str).str.lower().str.strip() + ", " +
                asset_list["PostCode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA25":
            asset_list["matching_address"] = asset_list[
                self.COLUMN_CONFIG[ha_name]["address"]
            ].astype(str).str.lower().str.strip()

            asset_list["matching_postcode"] = asset_list['matching_address'].apply(
                lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x
            )
        elif ha_name == "HA27":
            asset_list["matching_address"] = (
                asset_list[" Address"].astype(str).str.lower().str.strip() + ", " +
                asset_list[" Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list[" Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA28":
            asset_list["matching_address"] = (
                asset_list["House Number"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Street 1"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA32":
            asset_list["matching_address"] = (
                asset_list["Dwelling num"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Street"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA33":
            asset_list["matching_address"] = (
                asset_list["ADDRESS"].astype(str).str.lower().str.strip() + ", " +
                asset_list["POST CODE"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["POST CODE"].astype(str).str.lower().str.strip()
        elif ha_name == "HA34":
            asset_list["matching_address"] = (
                asset_list[" Address"].astype(str).str.lower().str.strip() + ", " +
                asset_list[" Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list[" Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA35":
            asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Post Code"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Address Post Code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA37":
            asset_list["matching_address"] = asset_list["ADDRESS LINE 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["ADDRESS LINE 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["ADDRESS LINE 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["POSTCODE"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip()
        elif ha_name == "HA38":
            asset_list["matching_address"] = asset_list["House_Number"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address_Line_1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address_Line_2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address_Line_3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA39":
            # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
            asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["add_2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["add_3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["post_code"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["post_code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA41":
            asset_list["matching_address"] = asset_list["AddressLine1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["AddressLine2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["AddressLine3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["AddressLine4"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["AddressLine5"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA42":
            asset_list["matching_address"] = asset_list["Dwelling Number"].astype(str).str.lower().str.strip() + " " + \
                                             asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Locality"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA44":
            asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postal Code"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postal Code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA50":
            asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Post Code"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA51":
            asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_address"] = np.where(
                asset_list["Block"].str.strip().str.len() > 0,
                asset_list["Block"].astype(str).str.lower().str.strip() + ", " + \
                asset_list["matching_address"],
                asset_list["matching_address"]
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA56":
            asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Post Code"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
        elif ha_name == "HA63":
            asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["POSTCODE"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip()
        elif ha_name == "HA70":
            asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["POSTCODE"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip()
        elif ha_name == "HA107":
            # Create matching_address by concatenating House No, Street, Town, District, Postcode
            asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["District"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Postcode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        elif ha_name == "HA117":
            asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["PostCode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
        elif ha_name == "HAXX":
            asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["PostCode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
        elif ha_name == "HAXXX":
            asset_list["matching_address"] = (
                asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " +
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        else:
            raise NotImplementedError("implement me")

        return asset_list

    @staticmethod
    def extract_property_info_ha107(properties):
        """
        Map HA107 free-text property descriptions onto a property type and a built form.

        Each description is searched (case-sensitively) for the known substrings below; the
        first substring found in each lookup wins.

        :param properties: iterable of description values, e.g. the unique values of the
            "Property type" column. Non-string entries (None / NaN) are mapped to no match.
        :return: DataFrame with one row per description and the columns "Property type"
            (the description as supplied), "property_type" and "built_form". Fields with no
            match are None.
        """
        property_types = {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
            "Bedsit": None
        }

        # "Semi Detached" has to be tested before "Detached": the latter is a substring of the
        # former, so testing it first would label every semi-detached property as detached
        built_forms = {
            "Semi Detached": "Semi-Detached",
            "Detached": "Detached",
            "End Terrace": "End-Terrace",
            "Mid Terrace": "Mid-Terrace"
        }

        def first_match(description, lookup):
            # Blank cells arrive as None / NaN, which cannot be substring-searched
            if not isinstance(description, str):
                return None
            for key, mapped_value in lookup.items():
                if key in description:
                    return mapped_value
            return None

        results = [
            {
                "Property type": description,
                "property_type": first_match(description, property_types),
                "built_form": first_match(description, built_forms)
            }
            for description in properties
        ]

        # Naming the columns keeps the frame mergeable on "Property type" even when it is empty
        return pd.DataFrame(results, columns=["Property type", "property_type", "built_form"])

    def append_asset_list_built_form(self, ha_name, asset_list):
        """
        Derive property type / built form columns for the HAs that need them.

        :param ha_name: housing association identifier, e.g. "HA6"
        :param asset_list: asset list DataFrame
        :return: the asset list; HA6 gains a "built_form" column, HA107 gains the columns
            produced by extract_property_info_ha107, every other HA is returned untouched
        """
        if ha_name == "HA6":
            # Built form is derived row by row from the "Property Type" text
            derived_built_form = asset_list["Property Type"].apply(self.identify_built_form_ha6)
            asset_list["built_form"] = derived_built_form
            return asset_list

        if ha_name == "HA107":
            # Map each distinct description once, then join the mapping back on
            description_lookup = self.extract_property_info_ha107(asset_list["Property type"].unique())
            return asset_list.merge(description_lookup, how="left", on="Property type")

        return asset_list

    @staticmethod
    def create_asset_list_house_no(ha_name, asset_list):
        """
        Append a "HouseNo" column to the asset list.

        For HAs that provide the house number in a dedicated column, that column is copied.
        For every other HA the house number is taken as the first space-separated token of
        the first comma-separated segment of "matching_address"; when that token is "flat"
        or "valley", the second token is used instead.

        :param ha_name: housing association identifier, e.g. "HA32"
        :param asset_list: asset list DataFrame; needs "matching_address" unless the HA has
            a dedicated house number column
        :return: the asset list with a "HouseNo" column
        """
        dedicated_house_no_column = {
            "HA107": "House No",
            "HA32": "Dwelling num",
            "HA28": "House Number",
            "HA38": "House_Number",
            "HA9": "House Number",
            "HAXXX": "Door Number",
        }

        source_column = dedicated_house_no_column.get(ha_name)
        if source_column is not None:
            asset_list["HouseNo"] = asset_list[source_column].copy()
            return asset_list

        # Tokenise the leading address segment. Tokens are indexed per row (rather than
        # expanded into columns) so a list where no segment contains a space - and hence
        # has no second token anywhere - still works instead of raising KeyError
        leading_segment = asset_list["matching_address"].str.split(",").str[0]
        tokens = leading_segment.str.split(" ")
        first_token = tokens.str[0]
        second_token = tokens.str[1]

        # If we have "flat" or "valley" as the house number, the number is the second token
        house_no = first_token.where(~first_token.isin(["flat", "valley"]), second_token)

        # Remove trailing punctuation such as , or ;
        house_no = house_no.str.rstrip(",;")

        asset_list = pd.concat([asset_list, house_no.rename("HouseNo")], axis=1)

        return asset_list

    @staticmethod
    def create_ciga_list_house_no(ciga_list):
        """
        Append a "HouseNo" column to the CIGA list.

        The house number is the first space-separated token of the first comma-separated
        segment of "Matched Address".

        :param ciga_list: CIGA DataFrame with a "Matched Address" column
        :return: the CIGA list with a "HouseNo" column appended
        """
        leading_segment = ciga_list['Matched Address'].str.split(',', expand=True)[0]

        # The split yields an unknown number of token columns; only the first is of interest
        leading_token = leading_segment.str.split(' ', expand=True).iloc[:, 0:1]
        leading_token.columns = ['HouseNo']

        return pd.concat([ciga_list, leading_token], axis=1)

    @staticmethod
    def dedupe_ciga_list(ciga_list):
        """
        Drop CIGA rows that refer to the same address.

        A "unique_key" column is built from "Matched Address" + "Matched Postcode" with all
        spaces and punctuation removed; only the first row for each key is kept.

        :param ciga_list: CIGA DataFrame with "Matched Address" and "Matched Postcode"
            columns (a "unique_key" column is added to it)
        :return: the de-duplicated CIGA list, including the "unique_key" column
        """
        unique_key = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
        # Remove spaces from the unique key
        unique_key = unique_key.str.replace(" ", "", regex=False)
        # Remove punctuation from the unique key. regex=True must be explicit: from pandas 2.0
        # str.replace treats the pattern as a literal string by default, which would leave
        # all punctuation in place
        unique_key = unique_key.str.replace(r'[^\w\s]', '', regex=True)
        ciga_list["unique_key"] = unique_key

        # Drop duplicated keys, keeping the first occurrence
        ciga_list = ciga_list[~ciga_list["unique_key"].duplicated()]
        return ciga_list

    @staticmethod
    def get_asset_sheetname(workbook):
        """
        Pick the name of the asset sheet from the names present in the workbook.

        :param workbook: object exposing a ``sheetnames`` collection
        :return: the first known asset sheet name found, or "Assets" when none is present
        """
        available = workbook.sheetnames

        for candidate in ("Asset List", "Asset list"):
            if candidate in available:
                return candidate

        # "Asset" is only chosen when there is no "Assets" sheet alongside it
        if "Asset" in available and "Assets" not in available:
            return "Asset"

        for candidate in ("Decent Homes Stock", "Report"):
            if candidate in available:
                return candidate

        return "Assets"

    @staticmethod
    def get_ciga_sheetname(workbook):
        """
        Pick the name of the CIGA sheet from the names present in the workbook.

        :param workbook: object exposing a ``sheetnames`` collection
        :return: the first known CIGA sheet name found, or "CIGA" when none is present
        """
        # Listed in priority order
        known_names = ("CIGA Checks", "CIGA checks", "CIGA check", "CIGA Check", "CIGA requested")
        available = workbook.sheetnames
        return next((name for name in known_names if name in available), "CIGA")

    @staticmethod
    def get_survey_sheetname(workbook):
        """
        Pick the name of the ECO survey sheet from the names present in the workbook.

        :param workbook: object exposing a ``sheetnames`` collection
        :return: the first known survey sheet name found, or "ECO surveys" when none is present
        """
        sheets_present = workbook.sheetnames
        # Checked in priority order
        for survey_name in ("ECO Surveys", "ECO Survey", "ECO 4 Surveys completed", "ECO4 Surveys"):
            if survey_name in sheets_present:
                return survey_name
        return "ECO surveys"

    @staticmethod
    def correct_ha51_asset_list(asset_list):
        """
        HA51-specific fix: for rows whose matching address contains "61 wandle bank", use
        the lower-cased "Block" value as the house number.

        :param asset_list: asset list with "matching_address", "Block" and "HouseNo" columns
        :return: the asset list with "HouseNo" corrected
        """
        at_wandle_bank = asset_list["matching_address"].str.contains("61 wandle bank")
        block_lowercased = asset_list["Block"].str.lower()

        asset_list["HouseNo"] = np.where(at_wandle_bank, block_lowercased, asset_list["HouseNo"])

        return asset_list

    def prepare_ha17(self, workbook):
        """
        Build the HA17 asset list from the workbook's two cavity-wall sheets.

        The "Blocks List - Cavity Wall only" and "Street Properties - Cavity Wall" sheets
        are each reduced to matching_address / matching_postcode / property_type /
        "ECO Eligibility" and stacked. Rows whose address contains a house number range
        ("<start> to <end> <rest>") are then expanded into one row per number, unless the
        row is marked "Not Eligible".

        :param workbook: workbook giving access to the two sheets above by name
        :return: DataFrame of assets with "asset_list_row_id" and "HouseNo" added
        """

        def read_sheet(sheet, first_data_row):
            # Both sheets carry their column names on row 2
            colnames = [cell.value for cell in sheet[2]]
            data = [
                [cell.value for cell in row]
                for row in sheet.iter_rows(min_row=first_data_row, values_only=False)
            ]
            return pd.DataFrame(data, columns=colnames)

        def normalise(series):
            # Same normalisation as used for every matching_* column
            return series.astype(str).str.lower().str.strip()

        block_name_column = "Block Name\n[as per Naming Convention procedure]"
        compressed_columns = ["matching_address", "matching_postcode", "property_type", "ECO Eligibility"]

        # Blocks: data is read from row 4 onwards and every block is treated as a flat
        blocks_df = read_sheet(workbook["Blocks List - Cavity Wall only"], first_data_row=4)
        blocks_df["matching_address"] = (
            normalise(blocks_df[block_name_column]) + ", " +
            normalise(blocks_df["Block Street Name"]) + ", " +
            normalise(blocks_df["Postcode"])
        )
        blocks_df["matching_postcode"] = normalise(blocks_df["Postcode"])
        blocks_df["property_type"] = "Flat"

        # Street properties: data is read from row 3 onwards; the property type comes from the sheet
        street_properties_df = read_sheet(workbook["Street Properties - Cavity Wall"], first_data_row=3)
        street_properties_df["matching_address"] = (
            normalise(street_properties_df[block_name_column]) + ", " +
            normalise(street_properties_df["Postcode"])
        )
        street_properties_df["matching_postcode"] = normalise(street_properties_df["Postcode"])
        street_properties_df["property_type"] = street_properties_df[
            "Block typology based on dwelling type\n[defined list]"
        ]

        asset_list_compressed = pd.concat(
            [blocks_df[compressed_columns], street_properties_df[compressed_columns]],
            axis=0
        )

        # Expand house number ranges into one row per house number
        range_regex = re.compile(r"(\d+)\s+to\s+(\d+)\s+(.*)")
        expanded_rows = []
        for _, row in tqdm(asset_list_compressed.iterrows(), total=len(asset_list_compressed)):
            record = row.to_dict()

            # Ineligible rows are kept as a single row and never expanded
            match = None
            if record["ECO Eligibility"] != "Not Eligible":
                match = range_regex.search(record["matching_address"])

            if match is None:
                expanded_rows.append(record)
                continue

            start_number = int(match.group(1))
            end_number = int(match.group(2))
            rest_of_address = match.group(3)

            if start_number > end_number:
                # A reversed range would expand to nothing and silently drop the property
                # from the asset list, so keep the row unexpanded instead
                expanded_rows.append(record)
                continue

            # NOTE: the new address is the house number plus the text after the range; any
            # text preceding the matched range is not carried over
            expanded_rows.extend(
                {**record, "matching_address": f"{house_number} {rest_of_address}"}
                for house_number in range(start_number, end_number + 1)
            )

        asset_list = pd.DataFrame(expanded_rows)

        # Add in asset_list_row_id
        asset_list["asset_list_row_id"] = [f"HA17{i}" for i in range(len(asset_list))]

        # Add on house number
        asset_list = self.create_asset_list_house_no(ha_name="HA17", asset_list=asset_list)

        return asset_list

    def load_asset_list(self, filepath, ha_name):
        """
        Load a housing association workbook and return its asset, survey, CIGA and ECO3 lists.

        The asset sheet is read, cleaned and enriched with matching address / postcode,
        house number and built form columns, then passed through the HA-specific
        ``correct_<ha>_asset_list`` method when one exists. Where present and applicable,
        the ECO3, survey and CIGA sheets are read and merged against the asset list.

        :param filepath: path of the workbook, passed to openpyxl.load_workbook
        :param ha_name: housing association identifier, e.g. "HA32"
        :return: tuple (asset_list, survey_list, ciga_list, eco3_list); lists that are not
            processed for the given HA are returned as empty DataFrames
        """

        def read_sheet(sheet, colnames=None):
            # Row 1 holds the headers (unless overridden names are supplied); data starts on row 2
            if colnames is None:
                colnames = [cell.value for cell in sheet[1]]
            body = [[cell.value for cell in row] for row in sheet.iter_rows(min_row=2, values_only=False)]
            return pd.DataFrame(body, columns=colnames)

        def drop_blank(frame):
            # Discard columns without a header, then rows that are completely empty
            frame = frame.loc[:, frame.columns.notnull()]
            return frame.loc[frame.notnull().any(axis=1)]

        workbook = openpyxl.load_workbook(filepath)

        # HA17 has its own workbook layout and only yields an asset list
        if ha_name == "HA17":
            return self.prepare_ha17(workbook), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

        asset_sheet = workbook[self.get_asset_sheetname(workbook)]
        asset_headers = [cell.value for cell in asset_sheet[1]]

        # For these HAs the header at the given position is renamed to "matching_postcode"
        postcode_header_position = {"HA25": 11, "HA31": 2, "HA54": 10, "HA5": 2}
        if ha_name in postcode_header_position:
            asset_headers[postcode_header_position[ha_name]] = "matching_postcode"

        asset_list = read_sheet(asset_sheet, asset_headers)
        asset_list = asset_list.loc[:, asset_list.columns.notnull()]

        # Remove entirely empty rows - row_color is ignored when deciding whether a row is empty
        without_row_color = asset_list.loc[:, asset_list.columns != 'row_color']
        asset_list = asset_list.loc[without_row_color.notnull().any(axis=1)]

        asset_list["asset_list_row_id"] = [f"{ha_name}{i}" for i in range(len(asset_list))]

        # Derived columns: matching address / postcode, house number, built form
        asset_list = self.create_asset_list_matching_address(ha_name=ha_name, asset_list=asset_list)
        asset_list = self.create_asset_list_house_no(ha_name=ha_name, asset_list=asset_list)
        asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list)

        # Apply the HA-specific correction, if this class defines one
        correction = getattr(self, f"correct_{ha_name.lower()}_asset_list", None)
        if correction is not None:
            asset_list = correction(asset_list)

        # HA1 and HA27 are returned with the asset list only - no survey, CIGA or ECO3 processing.
        # NOTE(review): the previous comment here named HA25 rather than HA27; HA25 is handled
        # separately below - confirm which HAs are intended
        if ha_name in ("HA1", "HA27"):
            return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

        # If we have ECO3 surveys, we need to match them, because any properties treated under
        # ECO3 won't be suitable under ECO4, since their walls will be filled
        eco3_list = pd.DataFrame()
        eco3_sheetname = next(
            (name for name in workbook.sheetnames if "eco3" in name.lower().replace(" ", "")),
            None
        )
        if eco3_sheetname is not None:
            eco3_list = drop_blank(read_sheet(workbook[eco3_sheetname]))
            eco3_list["eco3_list_row_id"] = [f"{ha_name}_Eco3_{i}" for i in range(len(eco3_list))]
            if not eco3_list.empty:
                eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name)

        # HA25 is returned without survey or CIGA lists
        if ha_name == "HA25":
            return asset_list, pd.DataFrame(), pd.DataFrame(), eco3_list

        # Survey list
        survey_sheet = workbook[self.get_survey_sheetname(workbook)]
        survey_list = drop_blank(read_sheet(survey_sheet))
        survey_list["survey_list_row_id"] = [f"{ha_name}_survey_{i}" for i in range(len(survey_list))]
        if not survey_list.empty:
            survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)

        # CIGA checks
        ciga_sheet = workbook[self.get_ciga_sheetname(workbook)]
        ciga_list = drop_blank(read_sheet(ciga_sheet))
        if not ciga_list.empty:
            # Remove rows with missing postcode, which happens in a small number of cases
            ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
            ciga_list["ciga_list_row_id"] = [f"{ha_name}_ciga_{i}" for i in range(len(ciga_list))]
            ciga_list = self.create_ciga_list_house_no(ciga_list)
            ciga_list = self.dedupe_ciga_list(ciga_list)
            ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)

        return asset_list, survey_list, ciga_list, eco3_list

    @staticmethod
    def correct_ha6_asset_list(asset_list):
        """
        HA6-specific fix: rewrite four street names in both address columns.

        :param asset_list: asset list with "propertyaddress" and "matching_address" columns
        :return: the asset list with the street names replaced
        """
        # (column, text to find, replacement) - applied in order
        street_name_fixes = (
            ("propertyaddress", "Baggott Place", "Baggotts Place"),
            ("matching_address", "baggott place", "baggotts place"),
            ("propertyaddress", "Cherry Tree", "Cherrytree"),
            ("matching_address", "cherry tree", "cherrytree"),
            ("propertyaddress", "Maryhill Close", "Mary Hill Close"),
            ("matching_address", "maryhill close", "mary hill close"),
            ("propertyaddress", "Moffat Way", "Moffatt Way"),
            ("matching_address", "moffat way", "moffatt way"),
        )

        for column, found, replacement in street_name_fixes:
            asset_list[column] = asset_list[column].str.replace(found, replacement)

        return asset_list

    @staticmethod
    def correct_ha56_asset_list(asset_list):
        """
        HA56-specific fix: mark a fixed set of properties as "Not eligible".

        :param asset_list: asset list with "Post Code", "Address 1" and "ECO Eligibility" columns
        :return: the asset list with "ECO Eligibility" overridden for the affected rows
        """
        # CH1 4JR has already been surveyed, but it's listed in the asset list as a single
        # row when it's actually 32 units, so it is set as ineligible; same for CW8 3EU.
        # SK17 6NR was already surveyed under ECO4.
        ineligible_postcodes = [
            "CH1 4JR",
            "CW8 3EU",
            "CW1 3HP",
            "WA4 2PH",
            "BD6 1QJ",
            "L39 1RS",
            "WA10 2DE",
            "SK17 6NR",
        ]
        postcode = asset_list["Post Code"]

        # One block is excluded by address rather than by its whole postcode
        tavlin_block = (postcode == "WA5 0EN") & (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")

        ineligible = postcode.isin(ineligible_postcodes) | tavlin_block
        asset_list["ECO Eligibility"] = np.where(ineligible, "Not eligible", asset_list["ECO Eligibility"])

        return asset_list

    @staticmethod
    def correct_ha14_asset_list(asset_list):
        """
        HA14-specific fix: 5 Queens Court is listed under DE72 3NP, but the postcode is
        actually DE72 3QZ. Both matching columns are overwritten for that row.

        :param asset_list: asset list with "Address 1", "Postcode", "matching_postcode" and
            "matching_address" columns
        :return: the asset list with the matching columns corrected
        """
        is_queens_court = (
            (asset_list["Address 1"] == "5 Queens Court") &
            (asset_list["Postcode"].str.strip() == "DE72 3NP")
        )

        # Written in lower case, like the matching_address below and the other matching_*
        # values, so the corrected postcode compares equal during matching
        asset_list.loc[is_queens_court, "matching_postcode"] = "de72 3qz"

        # We then correct the matching_address
        asset_list.loc[
            is_queens_court, "matching_address"
        ] = "5 queens court, garfield avenue, draycott, derby, de72 3qz"

        return asset_list

    @staticmethod
    def correct_ha15_asset_list(asset_list):
        """
        HA15-specific fix: force the matching postcode of 103 Priory Crescent to "hp19 9ny".

        Only "matching_postcode" is changed; no other column is touched.

        :param asset_list: asset list with "Address Line 1" and "matching_postcode" columns
        :return: the asset list with the matching postcode corrected
        """
        is_priory_crescent = asset_list["Address Line 1"] == "103 Priory Crescent"
        current_postcode = asset_list["matching_postcode"]

        asset_list["matching_postcode"] = np.where(is_priory_crescent, "hp19 9ny", current_postcode)
        return asset_list

    @staticmethod
    def correct_ha32_asset_list(asset_list):
        """
        HA32-specific fix: 7 Norton Grove is listed under HU4 6HQ; its postcode is replaced
        with "hu4 6hg".

        Corrections run after matching_address / matching_postcode have been derived, so the
        matching columns are corrected as well as the raw "Postcode" column (when present).
        Changing "Postcode" alone would leave the matching keys on the wrong postcode.

        :param asset_list: asset list with "Street", "Postcode" and "Dwelling num" columns,
            and optionally "matching_postcode" / "matching_address"
        :return: the asset list with the postcode corrected
        """
        # The dwelling number is compared as text so that a numeric cell (7) matches as
        # well as a text cell ("7")
        needs_fix = (
            (asset_list["Street"] == "Norton Grove") &
            (asset_list["Postcode"] == "HU4 6HQ") &
            (asset_list["Dwelling num"].astype(str).str.strip() == "7")
        )

        asset_list["Postcode"] = np.where(needs_fix, "hu4 6hg", asset_list["Postcode"])

        if "matching_postcode" in asset_list.columns:
            asset_list["matching_postcode"] = np.where(
                needs_fix, "hu4 6hg", asset_list["matching_postcode"]
            )

        if "matching_address" in asset_list.columns:
            # Same "<dwelling num>, <street>, <postcode>" layout as the HA32 matching address
            asset_list["matching_address"] = np.where(
                needs_fix, "7, norton grove, hu4 6hg", asset_list["matching_address"]
            )

        return asset_list

    @staticmethod
    def correct_ha38_asset_list(asset_list):
        """
        Apply manual corrections to the HA38 asset list.

        Two kinds of correction are made:
          1. Addresses written as "<building>/flat <n>, <rest>" (the flat number trails the building name, e.g.
             Kingsford Court) are rearranged to "FLAT <n>, <rest>, <building>", and the flat number is copied
             into "HouseNo". The "/flat" marker is matched case-insensitively. The extracted number is also
             left behind in a new "ExtractedHouseNo" column.
          2. Hard-coded "HouseNo" and "matching_address" values are set for the 10 South View rooms / flats,
             keyed on "Address_Line_1".

        :param asset_list: DataFrame with "matching_address", "HouseNo" and "Address_Line_1" columns
        :return: the same DataFrame, modified in place
        """
        # Both helpers must agree on what counts as a flat address, so they share case-insensitive patterns.
        # (Previously the check was case-insensitive but the split was not, which raised IndexError for
        # e.g. "/FLAT".)
        flat_marker = re.compile(r'/flat', re.IGNORECASE)
        flat_marker_with_space = re.compile(r'/flat ', re.IGNORECASE)

        # For Kingsford court, the house number is at the end of the address
        def rearrange_address_if_flat(address):
            # Missing / non-string addresses are passed through untouched
            if not isinstance(address, str):
                return address
            parts = flat_marker.split(address, maxsplit=1)
            if len(parts) < 2:
                return address
            return f"FLAT{parts[1]}, {parts[0]}"

        def extract_house_no_if_flat(address):
            if not isinstance(address, str) or flat_marker.search(address) is None:
                return None
            # The house number is the first space-delimited token following "/flat "
            pieces = flat_marker_with_space.split(address)
            if len(pieces) < 2:
                # "/flat" is present but is not followed by a space, so there is no number to extract
                return None
            # Remove trailing comma
            return pieces[1].split(' ')[0].replace(",", "")

        # Extraction has to run before the rearrangement, as it relies on the original "/flat " layout
        asset_list['ExtractedHouseNo'] = asset_list['matching_address'].apply(extract_house_no_if_flat)
        asset_list.loc[asset_list['ExtractedHouseNo'].notnull(), 'HouseNo'] = asset_list['ExtractedHouseNo']
        asset_list['matching_address'] = asset_list['matching_address'].apply(rearrange_address_if_flat)

        # We update a few specific rows: house number -> the Address_Line_1 values that should carry it
        south_view_house_numbers = {
            "10A": [
                "10 SOUTH VIEW/ROOM A1",
                "10 SOUTH VIEW/ROOM A2",
                "10 SOUTH VIEW/ROOM A3",
            ],
            "10B": [
                "10 SOUTH VIEW/ROOM B1",
                "10 SOUTH VIEW/ROOM B2",
                "10 SOUTH VIEW/ROOM B3",
                "10 SOUTH VIEW/ROOM B4",
            ],
            "10C": ["10 SOUTH VIEW/FLAT C"],
            "10D": ["10 SOUTH VIEW/FLAT D"],
            "10E": ["10 SOUTH VIEW/FLAT E"],
        }
        for house_no, address_lines in south_view_house_numbers.items():
            asset_list["HouseNo"] = np.where(
                asset_list["Address_Line_1"].isin(address_lines),
                house_no,
                asset_list["HouseNo"]
            )

        # Address_Line_1 -> the exact matching_address to use for that row.
        # The ROOM A1 entry used to end in a stray apostrophe ("dl16 7df'"); it now has the same shape as the
        # ROOM B1 entry.
        south_view_matching_addresses = {
            "10 SOUTH VIEW/ROOM A1": "10a, 10 south view/room a1, spennymoor, co. durham, dl16 7df",
            "10 SOUTH VIEW/ROOM B1": "10b, 10 south view/room b1, spennymoor, co. durham, dl16 7df",
            "10 SOUTH VIEW/FLAT C": "FLAT c, spennymoor, co. durham, dl16 7df, 10c, 10 south view",
            "10 SOUTH VIEW/FLAT D": "FLAT d, spennymoor, co. durham, dl16 7df, 10d, 10 south view",
            "10 SOUTH VIEW/FLAT E": "FLAT e, spennymoor, co. durham, dl16 7df, 10e, 10 south view",
        }
        for address_line, matching_address in south_view_matching_addresses.items():
            asset_list["matching_address"] = np.where(
                asset_list["Address_Line_1"].isin([address_line]),
                matching_address,
                asset_list["matching_address"]
            )

        return asset_list

    @staticmethod
    def correct_ha6_survey_list(survey_list):
        """
        Apply manual corrections to the HA6 survey list.

        The steps run in a fixed order, because later steps match on the output of earlier ones:
          1. Literal substring fixes to "Street / Block Name" (misspellings, "/KNUTTON" and "/TALKE" suffixes,
             "RD"/"Rd" abbreviations, full stops), interleaved with a few row-specific street / postcode fixes.
          2. Filling in the two known missing postcodes.
          3. Dropping a fixed list of rows.
          4. Postcode corrections on the remaining rows, followed by a final round of drops.

        :param survey_list: DataFrame with the columns "Street / Block Name", "Post Code", "NO.",
            "INSTALLED OR CANCELLED" and "SUBMISSION DATE"
        :return: a new DataFrame with the corrections applied and the unwanted rows removed. Note that the
            corrections made before the first drop are also written to the DataFrame that was passed in.
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"

        def replace_in_street(frame, replacements):
            # regex=False: every pattern here is literal text (in particular "." must not act as a wildcard),
            # regardless of what the installed pandas version uses as its default
            for wrong, right in replacements:
                frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

        def located_at(frame, street_name, post_code):
            return (frame[street_col] == street_name) & (frame[postcode_col] == post_code)

        # Correct the survey list
        replace_in_street(survey_list, [
            ("Seabridge Road", "Seabridge Lane"),
            # Strip out /KNUTTON from the street name
            ("/KNUTTON", ""),
            ("Clevend Road", "Cleveland Road"),
            ("TURNERS AVENUE", "Turner Avenue"),
            ("WEDGEWWOD AVENUE", "Wedgwood Avenue"),
        ])

        # The cherrytree record has wrong postcode
        survey_list.loc[survey_list[street_col] == "Cherrytree road", postcode_col] = "ST5 7BP"

        replace_in_street(survey_list, [
            ("MONUMENT RD", "Monument Road"),
            # Generally replace " RD" with " Road"
            (" RD", " Road"),
            ("HILARY Road", "Hillary Road"),
            # Remove full stops from the street name
            (".", ""),
            ("Chatworth road", "Chatsworth Place"),
            ("Wood Croft", "Woodcroft"),
            ("Milstone Avenue", "Millstone Avenue"),
            # Strip out /TALKE from the street name
            ("/TALKE", ""),
            ("Woodcutts Street", "Woodshutts Street"),
            ("HILLARY AVENUE", "Hillary Road"),
            # Replace " Rd" with " Road"
            (" Rd", " Road"),
        ])

        # We have a record listed as 19, MAPLE AVENUE ST7 1JX, when it should be 19, Hollins Crescent ST7 1JX
        survey_list.loc[
            located_at(survey_list, "MAPLE AVENUE", "ST7 1JX") & survey_list["NO."].isin([19]),
            street_col
        ] = "Hollins Crescent"

        # However, some of the maple avenue records, are indeed Maple avenue, but are listed with the wrong postcode.
        # E.g. number 26
        survey_list.loc[
            located_at(survey_list, "MAPLE AVENUE", "ST7 1JX") & survey_list["NO."].isin([26]),
            postcode_col
        ] = "ST7 1JW"

        replace_in_street(survey_list, [
            ("BURSLEY Road", "Bursley Way"),
            ("Brittania Avenue", "Brittain Avenue"),
            ("Hawthorn Road", "Hawthorne Road"),
            ("Eastdale Place", "Easdale Place"),
            ("Wedgewood Road", "Wedgwood Road"),
            ("Droitwich Drive", "Droitwich Close"),
            ("Longdale Road", "Langdale Road"),
        ])

        # We have 2 addresses in the survey list that don't have postcodes. We'll manually add them in
        for street_name, post_code in (("Rogers Avenue", "ST5 9AT"), ("Cedar Road", "ST5 7BY")):
            survey_list.loc[
                (survey_list[street_col] == street_name) & pd.isnull(survey_list[postcode_col]),
                postcode_col
            ] = post_code

        # PERFORM ADDITIONAL DROPS
        # Dropping rows based on multiple conditions
        number = survey_list['NO.']
        no_update_yet = survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")
        submission_date = survey_list['SUBMISSION DATE'].astype(str)
        conditions_to_drop = [
            located_at(survey_list, "Bedford Crescent", "ST5 3EH") & (number == 23) & no_update_yet,
            located_at(survey_list, "Hereford Avenue", "ST5 3EJ") & (number == 92) & no_update_yet,
            located_at(survey_list, "Seabridge Lane", "ST5 3EX") & number.isin([16, 18, 42]) & no_update_yet,
            located_at(survey_list, "ESKDALE PLACE", "ST5 3QW") & (number == 5) & (
                submission_date == "2023-03-06 00:00:00"),
            located_at(survey_list, "Birch House road", "ST6 2LS") & number.isin([56, 58]),
            located_at(survey_list, "Blackthorn Place", "ST6 2LS") & number.isin([37, 39]),
            located_at(survey_list, "Whitethorn Way", "ST5 7BT") & number.isin([17, 6]),
            located_at(survey_list, "Lion Grove", "ST5 7HQ") & number.isin([10, 12]) & no_update_yet,
            located_at(survey_list, "DENRY CRESCENT", "ST5 8JW") & (number == 87) & no_update_yet,
            located_at(survey_list, "HOLLINS CRESCENT", "ST7 1JW") & (number == 19),
        ]

        # Combine all conditions with an OR "|"
        combined_condition = np.logical_or.reduce(conditions_to_drop)

        # Drop rows that meet the combined condition. Take an explicit copy, as columns are assigned on the
        # result below and assigning on a filtered slice triggers pandas' SettingWithCopyWarning
        survey_list = survey_list[~combined_condition].copy()

        # Making replacements using np.where: these Whitethorn Way records are listed under the wrong postcode
        for wrong_post_code, house_number in (("ST5 3EH", 17), ("ST5 3ED", 6)):
            survey_list[postcode_col] = np.where(
                located_at(survey_list, "Whitethorn Way", wrong_post_code) & (survey_list['NO.'] == house_number),
                "ST5 7BT",
                survey_list[postcode_col]
            )

        # Maple avenue (stoke on trent, not newcastle) should be st7 1jw, and Hollins Crescent should be st7 1jx.
        # Both the street and the postcode are compared case-insensitively here
        for street_fragment, wrong_post_code, right_post_code in (
            ("maple avenue", "st7 1jx", "st7 1jw"),
            ("hollins crescent", "st7 1jw", "st7 1jx"),
        ):
            survey_list[postcode_col] = np.where(
                (survey_list[street_col].str.lower().str.contains(street_fragment)) & (
                    survey_list[postcode_col].str.lower() == wrong_post_code
                ),
                right_post_code,
                survey_list[postcode_col]
            )

        # Additional drops as the above misses some:
        missed_drops = (
            (survey_list["NO."].astype(str).isin(["18", "42"])) &
            (survey_list[street_col] == "Seabridge Lane") &
            (survey_list[postcode_col] == "ST5 3EY") &
            (survey_list["SUBMISSION DATE"].astype(str) == "24.07.2023") &
            (survey_list["INSTALLED OR CANCELLED"].str.contains("NO UPDATE YET"))
        )

        return survey_list[~missed_drops].copy()

    @staticmethod
    def correct_ha14_survey_list(survey_list):
        """
        Apply manual street-name corrections to the HA14 survey list.

        Substring replacements are made on "Street / Block Name", plus one exact-match correction that is
        restricted to two postcodes.

        :param survey_list: DataFrame with "Street / Block Name" and "Post Code" columns
        :return: the same DataFrame, with its street names corrected
        """
        street_col = "Street / Block Name"

        for wrong, right in (
            ("Godfrey Road", "Godfrey Drive"),
            ("Oiliver Road", "Oliver Road"),
        ):
            survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

        # Within postcodes DE7 4FB and DE7 4EZ the street is WINDERMERE AVENUE; the survey list spells it
        # WINDEREMERE AVENUE (with an extra "e")
        is_misspelt = survey_list[street_col] == "WINDEREMERE AVENUE"
        in_affected_postcode = survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"])
        survey_list.loc[is_misspelt & in_affected_postcode, street_col] = "WINDERMERE AVENUE"

        survey_list[street_col] = survey_list[street_col].str.replace("MACDONALD SQAURE", "MACDONALD SQUARE")

        return survey_list

    @staticmethod
    def correct_ha15_survey_list(survey_list):
        """
        Apply manual street-name corrections to the HA15 survey list.

        Strips the ", Milton Keynes" suffix from "Mary Mac Manus Drive, Milton Keynes" wherever that text
        appears in "Street / Block Name".

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame, with its street names corrected
        """
        street_col = "Street / Block Name"
        with_town = "Mary Mac Manus Drive, Milton Keynes"
        without_town = "Mary Mac Manus Drive"

        corrected_streets = survey_list[street_col].str.replace(with_town, without_town)
        survey_list[street_col] = corrected_streets
        return survey_list

    @staticmethod
    def correct_ha16_survey_list(survey_list):
        """
        Apply manual corrections to the HA16 survey list.

        Steps, in order:
          1. Normalise "Street / Block Name": "/" becomes ", ", the text is lower-cased, the standalone word
             "rd" is expanded to "road", whitespace around commas is normalised and the value is stripped.
          2. Correct two known postcode errors.
          3. Apply a list of literal street-name replacements.
          4. Replace the house number "66-68" with "66".
          5. Drop two duplicated entries.

        :param survey_list: DataFrame with "Street / Block Name", "Post Code", "NO." and "SUBMISSION DATE" columns
        :return: a new DataFrame with the corrections applied and the duplicated entries removed. Note that the
            column corrections are also written to the DataFrame that was passed in.
        """
        street_col = "Street / Block Name"

        survey_list[street_col] = survey_list[street_col].str.replace("/", ", ", regex=False)
        survey_list[street_col] = survey_list[street_col].str.lower()
        # A special case for "REEDS RD" used to sit here, but it compared against upper-case text after the
        # column had been lower-cased and so could never match. "reeds rd" is handled by the general "rd"
        # expansion below.

        # Replace the standalone word "rd" with "road"
        survey_list[street_col] = survey_list[street_col].str.replace(r'\brd\b', 'road', regex=True)

        # Replace " , " with ", "
        survey_list[street_col] = survey_list[street_col].str.replace(" , ", ', ', regex=False)
        # Fix "{place} ,{place}" with "{place}, {place}"
        survey_list[street_col] = survey_list[street_col].str.replace(r'\s*,\s*', ', ', regex=True)
        # Strip whitespace
        survey_list[street_col] = survey_list[street_col].str.strip()

        # Correct errors
        survey_list["Post Code"] = np.where(
            survey_list["Post Code"] == "M38 0SA",
            "M38 9SA",
            survey_list["Post Code"]
        )

        survey_list["Post Code"] = np.where(
            (survey_list[street_col] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"),
            "M44 5JF",
            survey_list["Post Code"]
        )

        # Literal substring replacements, applied in this order (some entries depend on earlier ones, e.g. the
        # specific ", fitton hill" cases come before the general one).
        # NOTE(review): "Vaughan", "cartleach Grove" and "arbor Grove" re-introduce capitals into an otherwise
        # lower-cased column. They are kept exactly as they were - confirm whether downstream matching is
        # case-sensitive.
        street_replacements = [
            ("eccels", "eccles"),
            ("chatley, road", "chatley road"),
            ("vaughen", "Vaughan"),
            ("cresent", "crescent"),
            ("plantation road", "plantation avenue"),
            ("how clough drive", "howclough drive"),
            ("brockhurst lane", "brookhurst lane"),
            ("biirch road", "birch road"),
            ("hadson road", "hodson road"),
            ("harbonne avennue", "narbonne avenue"),
            ("cumberland road, cadishead", "cumberland avenue, cadishead"),
            ("aston field drive", "ashton field drive"),
            ("wedgewood road", "wedgwood road"),
            ("hamilton close", "hamilton avenue"),
            ("lichens crescent, fitton hill", "lichens crescent"),
            ("south croft, fitton hill", "south croft"),
            (", fitton hill", ""),
            ("firtree dr", "fir tree avenue"),
            ("hawthorne road", "hawthorn crescent"),
            ("rein lee avenue", "reins lee avenue"),
            ("westerhill road", "wester hill road"),
            ("st martins road", "saint martins road"),
            ("timperley avenue", "timperley close"),
            ("eastwood road", "eastwood avenue"),
            ("new road", "new street"),
            ("grassmere road", "grasmere road"),
            ("hulton road", "hulton avenue"),
            ("beechfield avenue", "beechfield road"),
            ("princess avenue", "princes avenue"),
            ("edge ford crecent", "edge fold crescent"),
            ("conniston avenue", "coniston avenue"),
            ("blackthorne crescent", "blackthorn crescent"),
            ("wellstock road", "wellstock lane"),
            ("brackley avenue", "brackley street"),
            ("brook avenue swinton", "brook avenue, swinton"),
            ("green avenue swinton", "green avenue, swinton"),
            ("grasmere avenue wardley", "grasmere avenue, wardley"),
            ("mardale avenue wardle", "mardale avenue, wardle"),
            ("carleach grove", "cartleach Grove"),
            ("arbour grove", "arbor Grove"),
        ]
        for wrong, right in street_replacements:
            # regex=False keeps these as plain-text replacements whatever the pandas default is
            survey_list[street_col] = survey_list[street_col].str.replace(wrong, right, regex=False)

        # Replacement for clively avenue 66-68 (applied to any row whose number is "66-68")
        survey_list["NO."] = np.where(
            survey_list["NO."] == "66-68",
            "66",
            survey_list["NO."]
        )

        # Delete some duplicated entries: (street, number, postcode, submission date)
        duplicated_entries = [
            ("york road", "12", "M44 5HU", "45229"),
            ("peatfield avenue", "23", "M27 9XG", "45236"),
        ]
        for street_name, house_number, post_code, submitted in duplicated_entries:
            is_duplicate = (
                (survey_list[street_col] == street_name) &
                (survey_list["NO."].astype(str) == house_number) &
                (survey_list["Post Code"] == post_code) &
                (survey_list["SUBMISSION DATE"].astype(str) == submitted)
            )
            survey_list = survey_list[~is_duplicate]

        # Return an independent copy, so callers assigning columns on the result are not working on a
        # filtered slice of the input
        return survey_list.copy()

    @staticmethod
    def correct_ha24_survey_list(survey_list):
        """
        Apply manual street-name corrections to the HA24 survey list.

        "Street / Block Name" is normalised ("/" becomes ", ", lower-cased, stripped) and then a fixed list of
        literal substring replacements is applied in order.

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame, with its street names corrected
        """
        street_col = "Street / Block Name"

        survey_list[street_col] = survey_list[street_col].str.replace("/", ", ", regex=False)
        survey_list[street_col] = survey_list[street_col].str.lower()
        survey_list[street_col] = survey_list[street_col].str.strip()

        street_replacements = [
            ("council house, nidds lane", "nidds lane"),
            ("wirral avenue", "wirrall avenue"),
            ("st ives road", "st. ives crescent"),
            ("sundringham road", "sandringham road"),
            ("milton avenue", "milton road"),
            ("st ives crescent", "st. ives crescent"),
            ("council house, waterbelly lane", "waterbelly lane"),
            # Generally remove "council house, " from the street name
            ("council house, ", ""),
            ("st. leodegars close", "st leodegars close"),
            ("montgomery crescent", "montgomery road"),
        ]
        for wrong, right in street_replacements:
            # regex=False: the patterns are plain text. Without it, older pandas versions default to regex
            # matching, where the "." in "st. leodegars close" would match any character
            survey_list[street_col] = survey_list[street_col].str.replace(wrong, right, regex=False)

        return survey_list

    @staticmethod
    def correct_ha28_survey_list(survey_list):
        """
        Apply manual corrections to the HA28 survey list.

        The "NO " column (with a trailing space) is renamed to "NO." to align with the other survey sheets,
        two postcodes written without a space are corrected, and the street suffix is removed from two
        "<block>/<street>" names.

        :param survey_list: DataFrame with "NO ", "Post Code" and "Street / Block Name" columns
        :return: a new DataFrame (the result of the rename) with the corrections applied
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"

        survey_list = survey_list.rename(columns={"NO ": "NO."})

        # Postcodes that were entered without the separating space
        for unspaced, spaced in (("ME75HA", "ME7 5HA"), ("ME75TW", "ME7 5TW")):
            survey_list[postcode_col] = np.where(
                survey_list[postcode_col] == unspaced, spaced, survey_list[postcode_col]
            )

        # Keep only the block name where the sheet lists "<block>/<street>"
        for block_and_street, block_only in (
            ("ANDREW MANOR/BRITTON ST", "ANDREW MANOR"),
            ("ST MARKS HOUSE/SAXON ST", "ST MARKS HOUSE"),
        ):
            survey_list[street_col] = survey_list[street_col].str.replace(block_and_street, block_only)

        return survey_list

    @staticmethod
    def correct_ha38_survey_list(survey_list):
        """
        Apply manual corrections to the HA38 survey list.

        The "NO " column (with a trailing space) is renamed to "NO." to align with the other survey sheets.
        Street names are then cleaned up (specific replacements, then keeping only the text before the first
        "/" and before the first ","), rows that have no counterpart in the asset list are dropped, and a few
        street names / postcodes are corrected.

        The steps run in their original order, because several of them match on the output of earlier ones.

        :param survey_list: DataFrame with "NO ", "Street / Block Name" and "Post Code" columns
        :return: a new DataFrame with the corrections applied and the unmatched rows removed
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"

        def replace_in_street(frame, wrong, right):
            # regex=False keeps these as plain-text replacements whatever the pandas default is
            frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

        survey_list = survey_list.rename(columns={"NO ": "NO."})

        replace_in_street(survey_list, 'Kingsford Court, Coombe Valley Road', 'Kingsford Court')
        replace_in_street(survey_list, 'LESLIE TEW COURT/DERWENT ROAD', 'LESLIE TEW COURT')

        # There is no 18A LESLIE TEW COURT in the asset list.
        # Take an explicit copy, as columns are assigned on the result below and assigning on a filtered slice
        # triggers pandas' SettingWithCopyWarning
        survey_list = survey_list[
            ~((survey_list[street_col] == "LESLIE TEW COURT") &
              (survey_list[postcode_col] == "TN10 3TX") &
              (survey_list["NO."] == "18A"))
        ].copy()

        replace_in_street(survey_list, 'Brindley House, Wellbeck Road', 'Brindley House')

        # Try taking just the first part of the string, splitting on a /
        survey_list[street_col] = survey_list[street_col].str.split('/').str[0].str.strip()

        replace_in_street(survey_list, 'HUNTSMAN WAY', 'HUNTSMANS WAY')

        # Try taking just the first part of the string, splitting on a ,
        survey_list[street_col] = survey_list[street_col].str.split(',').str[0].str.strip()

        replace_in_street(survey_list, "McCLAREN COURT", "MCLAREN COURT")
        replace_in_street(survey_list, "ST JAMES CLOISTERS", "ST. JAMES'S CLOISTERS")

        # The Melton Road flats are listed as "FLAT <n> <building number>" in the number column; move the
        # building number into the street name
        for building_number in ("22", "24"):
            flat_labels = [f"FLAT {flat} {building_number}" for flat in range(1, 7)]
            survey_list[street_col] = np.where(
                survey_list["NO."].isin(flat_labels) & (survey_list[street_col] == "MELTON ROAD"),
                f"{building_number} MELTON ROAD",
                survey_list[street_col]
            )

        replace_in_street(survey_list, "TURRETT GREEN COURT SILENT STREET", "TURRET GREEN COURT")

        # These (street, number) records don't exist in the asset list
        missing_from_asset_list = [
            ("TURRET GREEN COURT", 1),
            ("45 RAYWELL STREET", 3),
            ("Avondale Drive", 40),
        ]
        for street_name, house_number in missing_from_asset_list:
            survey_list = survey_list[
                ~((survey_list[street_col] == street_name) &
                  (survey_list["NO."] == house_number))
            ]
        # Copy again for the same reason as above: more column assignments follow
        survey_list = survey_list.copy()

        # 17A beech road has the wrong postcode
        survey_list[postcode_col] = np.where(
            (survey_list[street_col] == "BEECH ROAD") &
            (survey_list[postcode_col] == "DH6 1JD"),
            "DH6 1JB",
            survey_list[postcode_col]
        )

        survey_list[street_col] = np.where(
            (survey_list[street_col] == "SOUTHVIEW") &
            (survey_list[postcode_col] == "DL16 7DF"),
            "SOUTH VIEW",
            survey_list[street_col]
        )

        return survey_list

    @staticmethod
    def correct_ha32_survey_list(survey_list):
        """
        Apply manual street-name corrections to the HA32 survey list.

        Each correction only applies where "Street / Block Name" is exactly equal to the listed value; the
        whole value is then swapped for its corrected form.

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame, with its street names corrected
        """
        street_col = "Street / Block Name"

        # Listed street name -> corrected street name (exact matches only)
        exact_corrections = {
            "Coxwold": "Coxwold Grove",
            # Misspelling of Barrington Avenue
            "Barringhton Avenue": "Barrington Avenue",
            # The Rustenburg addresses are listed without "Street"
            "Rustenburg": "Rustenburg Street",
            # Malin Lodge is listed together with its street, in two different casings
            "MALIN LODGE, RONALDSWAY CLOSE": "Malin Lodge",
            # Misspelling of Faroes Close
            "Feroes Close": "Faroes Close",
            # Double space
            'FORESTER  WAY': 'FORESTER WAY',
            '6 Zeigfeld': 'Ziegfeld Court',
            'Malin Lodge, Ronaldsway Close': 'Malin Lodge',
        }

        for listed_name, corrected_name in exact_corrections.items():
            is_listed_name = survey_list[street_col] == listed_name
            survey_list[street_col] = np.where(is_listed_name, corrected_name, survey_list[street_col])

        return survey_list

    @staticmethod
    def correct_ha50_survey_list(survey_list):
        """
        Apply the HA50-specific corrections to the survey list.

        The steps run in the order they were originally written, because some row removals rely
        on an earlier postcode or street-name correction. All text replacements are literal
        substring replacements (``regex=False``), so patterns such as "St. James Place" behave the
        same on every pandas version instead of having "." treated as a regex wildcard. Each row
        removal returns an independent copy, so later column assignments never write to a slice.

        The first postcode correction is still written onto the frame that was passed in, as
        before; everything after the first row removal happens on a new frame.

        :param survey_list: DataFrame with "NO.", "Street / Block Name" and "Post Code" columns
        :return: corrected DataFrame with unwanted rows and duplicated addresses removed
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"
        number_col = "NO."

        def rename_streets(frame, replacements):
            # Literal substring replacement, applied in the given order
            for wrong, right in replacements:
                frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

        def drop_address(frame, street, postcode, numbers):
            # Remove the given house numbers on one street/postcode and detach from the parent frame
            unwanted = (
                (frame[street_col] == street) &
                (frame[postcode_col] == postcode) &
                (frame[number_col].isin(numbers))
            )
            return frame[~unwanted].copy()

        survey_list[postcode_col] = np.where(
            (survey_list[street_col] == 'COSELEY STREET') &
            (survey_list[postcode_col] == 'ST16 1LR'),
            "ST6 1JU",
            survey_list[postcode_col]
        )

        # Remove some of COSELEY STREET, as we have surveys done outside of the asset list
        survey_list = drop_address(survey_list, "COSELEY STREET", "ST6 1JU", [96])

        survey_list[postcode_col] = survey_list[postcode_col].str.replace("ST33JZ", "ST3 3JZ", regex=False)

        # Remove some of Jesmond Drive, as we have surveys done outside of the asset list
        survey_list = drop_address(survey_list, "Jesmond Drive", "ST3 3JZ", [29])

        rename_streets(survey_list, [("BRUNDELL OVAL", "BRUNDALL OVAL")])

        # Remove 4 Linden Place and 11 Tilehurst Place
        survey_list = drop_address(survey_list, "Linden Place", "ST3 3AT", [4])
        survey_list = drop_address(survey_list, "Tilehurst Place", "ST3 3AP", [11])

        rename_streets(survey_list, [
            ("deavile road", "DEAVILLE ROAD"),
            ("WOOLISCROFT ROAD", "WOOLLISCROFT ROAD"),
            ("Leak Road", "Leek Road"),
            ("Springfield road", "Springfields road"),
            ("MILLWARD RD", "MILLWARD ROAD"),
            ("REPINGTON RD", "REPINGTON ROAD"),
            ("ECCELSTONE PLACE", "ECCLESTONE PLACE"),
            ("St. James Place", "St James Place"),
            ("CHELL HEATH RD", "CHELL HEATH ROAD"),
        ])

        # Correct postcode (must follow the CHELL HEATH RD -> ROAD rename above)
        survey_list[postcode_col] = np.where(
            (survey_list[street_col] == 'CHELL HEATH ROAD') &
            (survey_list[postcode_col] == 'ST6 6HU'),
            "ST6 6HJ",
            survey_list[postcode_col]
        )

        rename_streets(survey_list, [
            ("Franklin Rd", "Franklin Road"),
            ("Lodge Rd", "Lodge Road"),
            ("St Matthews Street", "St Matthew Street"),
            ("Grove Bank Road", "Grovebank Road"),
            ("OVERSLEY RD", "OVERSLEY ROAD"),
            # Replace all of the remaining " RD" with " ROAD"
            (" RD", " ROAD"),
            ("St. Georges Crescent", "St Georges Crescent"),
            ("Tewson Road", "Tewson Green"),
        ])

        # Remove 55 Seabridge Lane and 56 Tyne Way
        survey_list = drop_address(survey_list, "Seabridge Lane", "ST5 4AG", [55])
        survey_list = drop_address(survey_list, "Tyne Way", "ST5 4AX", [56])

        rename_streets(survey_list, [
            ("St.Bernards Place", "St Bernard Place"),
            ("Penarth Road", "Penarth Grove"),
            ("St. Marys Road", "St Marys Road"),
            ("Larch Drive", "Larch Grove"),
        ])

        # Drop 31 Lauder Place North, as there is a duplicate. This version also has a wrong postcode
        survey_list = drop_address(survey_list, "LAUDER PLACE NORTH", "ST20QS", [31])

        # Handle dropping of dupes: compare addresses ignoring case and spaces
        survey_list["street_pruner"] = survey_list[street_col].str.lower().str.replace(" ", "", regex=False)
        survey_list["postcode_pruner"] = survey_list[postcode_col].str.lower().str.replace(" ", "", regex=False)

        survey_list = survey_list.drop_duplicates([number_col, "street_pruner", "postcode_pruner"])
        survey_list = survey_list.drop(columns=["street_pruner", "postcode_pruner"])

        return survey_list

    @staticmethod
    def correct_ha107_survey_list(survey_list):
        """
        Fix misspelt and mis-punctuated HA107 street names in the survey list.

        Each pair is applied as a substring replacement on "Street / Block Name", in the order
        listed. The column is overwritten on the frame that was passed in, which is also returned.

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame with the street names corrected
        """
        column = "Street / Block Name"

        # (text found in the survey list, corrected text)
        corrections = [
            ("Front Street, East Stockham", "Front Street, East Stockwith"),
            ("HONEYHOLE L;ANE", "HONEYHOLES LANE"),
            # The entries below only add the missing comma after the street name
            ("Croft Lane Cherry Willingham, Lincoln", "Croft Lane, Cherry Willingham, Lincoln"),
            ("Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"),
            ("Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"),
            ("Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"),
            ("Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"),
            ("Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"),
            ("Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"),
            ("Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"),
            ("SPRINKHILL ROAD", "SPINKHILL ROAD"),
        ]

        streets = survey_list[column]
        for wrong, right in corrections:
            streets = streets.str.replace(wrong, right)
        survey_list[column] = streets

        return survey_list

    @staticmethod
    def correct_ha41_survey_list(survey_list):
        """
        Return the HA41 survey list as-is; no corrections are applied for this association.

        :param survey_list: survey DataFrame
        :return: the object that was passed in, unchanged
        """
        return survey_list

    @staticmethod
    def correct_ha12_survey_list(survey_list):
        """
        Fix HA12 street-name misspellings and unspaced postcodes in the survey list.

        Both columns are corrected with substring replacements and written back onto the frame
        that was passed in, which is also returned.

        :param survey_list: DataFrame with "Street / Block Name" and "Post Code" columns
        :return: the same DataFrame with corrected street names and postcodes
        """
        # Per column: (text found in the survey list, corrected text), applied in order
        fixes_by_column = {
            "Street / Block Name": [
                ("Henstone Road", "Hanstone Road"),
                ("Lindern avenue", "Linden Avenue"),
                ("priness way", "Princess Way"),
                ("Worth Crecesent", "Worth Crescent"),
                ("Adderbrook Crescent", "Addenbrooke Crescent"),
                ("Kinver Road", "Kinver Avenue"),
            ],
            "Post Code": [
                ("DY117HA", "DY11 7HA"),
                ("DY117HF", "DY11 7HF"),
            ],
        }

        for column, fixes in fixes_by_column.items():
            values = survey_list[column]
            for wrong, right in fixes:
                values = values.str.replace(wrong, right)
            survey_list[column] = values

        return survey_list

    @staticmethod
    def correct_ha13_survey_list(survey_list):
        """
        Apply the HA13-specific corrections to the survey list.

        Street names and postcodes are fixed with literal substring replacements. ``regex=False``
        is passed explicitly because the postcode pattern "HP2 5SF+" contains a regex quantifier:
        on pandas versions where ``str.replace`` defaults to regex matching, the trailing "+" would
        never be removed. The corrections are written onto the frame that was passed in; the final
        row removal returns a filtered frame.

        :param survey_list: DataFrame with "NO.", "Street / Block Name", "Post Code" and
            "INSTALLED OR CANCELLED" columns
        :return: corrected DataFrame with the known duplicate record removed
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"

        street_fixes = [
            ("Woodfarm Road", "WOOD FARM ROAD"),
            ("ALLANDALE ROAD", "ALLANDALE"),
            ("NEWFIELDS LANE", "NEWFIELD LANE"),
            ("BROADFIELDS ROAD", "BROADFIELD ROAD"),
            ("PESCOTT HILL", "PESCOT HILL"),
        ]
        for wrong, right in street_fixes:
            survey_list[street_col] = survey_list[street_col].str.replace(wrong, right, regex=False)

        # Strip the stray "+" from the postcode (literal match, not a regex)
        survey_list[postcode_col] = survey_list[postcode_col].str.replace("HP2 5SF+", "HP2 5SF", regex=False)

        # This is a duplicate record
        is_duplicate_record = (
            (survey_list["NO."] == 33) &
            (survey_list[street_col] == "Turners Hill") &
            (survey_list[postcode_col] == "HP2 4LH") &
            (survey_list["INSTALLED OR CANCELLED"] == "NO UPDATE - CHECKED 18.12.23")
        )

        return survey_list[~is_duplicate_record]

    @staticmethod
    def correct_ha18_survey_list(survey_list):
        """
        Return the HA18 survey list as-is; no corrections are applied for this association.

        :param survey_list: survey DataFrame
        :return: the object that was passed in, unchanged
        """
        return survey_list

    @staticmethod
    def correct_ha35_survey_list(survey_list):
        """
        Return the HA35 survey list as-is; no corrections are applied for this association.

        :param survey_list: survey DataFrame
        :return: the object that was passed in, unchanged
        """
        return survey_list

    @staticmethod
    def correct_ha34_survey_list(survey_list):
        """
        Apply the HA34-specific corrections to the survey list.

        Changes compared with the previous implementation:

        * every row removal returns an independent copy, so the following column assignments no
          longer write to a slice of another frame;
        * text replacements are literal (``regex=False``);
        * the "187 A" -> "187A" house-number fix is applied only to the matching rows. Previously
          ``np.where`` rebuilt the whole "NO." column, which turned a purely numeric column into
          strings.

        The input frame is not modified; the first step already produces a new frame.

        :param survey_list: DataFrame with "NO.", "Street / Block Name" and "Post Code" columns
        :return: corrected DataFrame with unmatched rows and duplicates removed
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"
        number_col = "NO."

        def rename_streets(frame, replacements):
            # Literal substring replacement, applied in the given order
            for wrong, right in replacements:
                frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

        def drop_house(frame, street, number):
            # Remove one house on one street and detach the result from the parent frame
            unwanted = (frame[street_col] == street) & (frame[number_col] == number)
            return frame[~unwanted].copy()

        # Not in the asset list
        survey_list = survey_list[survey_list[postcode_col] != "L5 3SS"].copy()

        survey_list[postcode_col] = survey_list[postcode_col].str.replace("L177DR", "L17 7DR", regex=False)

        rename_streets(survey_list, [
            ("PENVALLEY CRESENT", "Penvalley Crescent"),
            ("PENLINKEN DRIVE", "Penlinken Drive"),
        ])

        # There's no 32 Penlinken Drive in the asset sheet
        survey_list = drop_house(survey_list, "Penlinken Drive", 32)
        # There's no 30 Gwent Street in the asset sheet
        survey_list = drop_house(survey_list, "GWENT ST", 30)

        rename_streets(survey_list, [
            ("POULTON RD", "Poulton Road"),
            ("ST PAULS RD", "St Pauls Road"),
            ("BROAD LANE, KIRKBY", "BROAD LANE"),
            ("BULLENS RD, KIRKBY", "Bullens Road"),
        ])

        # There's no 219 NORTH HILL ST in the asset sheet
        survey_list = drop_house(survey_list, "NORTH HILL ST", 219)

        rename_streets(survey_list, [
            ("CROSLAND RD, KIRKBY", "CROSLAND ROAD"),
            ("PARK BROW DRIVE, KIRKBY", "Park Brow Drive"),
            ("CELTIC TREET", "Celtic Street"),
            ("BUCKLAND ROAD", "Buckland Street"),
        ])

        # Duplicates
        survey_list = survey_list.drop_duplicates([street_col, number_col, postcode_col])

        # This is a duplicate with wrong postcode
        wrong_postcode_duplicate = (
            (survey_list[street_col] == "CLARIBEL STREET") &
            (survey_list[number_col] == 7) &
            (survey_list[postcode_col] == "L8 8AF")
        )
        survey_list = survey_list[~wrong_postcode_duplicate].copy()

        # Remove the space from house number "187 A"; touch only the matching rows so the
        # remaining house numbers keep their original types
        needs_suffix_fix = (survey_list[number_col] == "187 A") & (survey_list[postcode_col] == "L32 6QF")
        if needs_suffix_fix.any():
            survey_list.loc[needs_suffix_fix, number_col] = "187A"

        return survey_list

    @staticmethod
    def correct_ha56_survey_list(survey_list):
        """
        Apply the HA56-specific corrections to the survey list.

        The first row removal now returns an independent copy, so the street-name corrections
        that follow no longer assign onto a slice of the caller's frame. Replacements are
        literal (``regex=False``). The input frame is not modified.

        :param survey_list: DataFrame with "NO.", "Street / Block Name" and "Post Code" columns
        :return: corrected DataFrame with the unwanted rows removed
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"
        number_col = "NO."

        # Not in asset list
        not_in_asset_list = (
            (survey_list[street_col] == "Samual Street") &
            (survey_list[number_col].isin([22, 24])) &
            (survey_list[postcode_col] == "WA5 1BB")
        )
        survey_list = survey_list[~not_in_asset_list].copy()

        # Expand abbreviated street names
        for abbreviated, expanded in (
            ("STOURTON RD", "Stourton Road"),
            ("BIRKIN RD", "Birkin Road"),
            ("PORTLAND RD", "Portland Road"),
        ):
            survey_list[street_col] = survey_list[street_col].str.replace(abbreviated, expanded, regex=False)

        # We remove a row, because two rows match to a block listing
        matches_block_listing = (
            (survey_list[street_col] == "Tavlin Avenue") &
            (survey_list[number_col] == 17) &
            (survey_list[postcode_col] == "WA5 0EN")
        )

        return survey_list[~matches_block_listing]

    @staticmethod
    def correct_ha30_survey_list(survey_list):
        """
        Apply the HA30-specific corrections to the survey list.

        The order of the steps is significant and is kept from the original implementation: some
        streets are renamed first and then filtered by their new name, and the final
        "ST LUKES CLOSE" rename deliberately happens after the "St. Lukes Close" row removal.

        Every row removal returns an independent copy, so the column assignments that follow do
        not write to a slice of another frame; text replacements are literal (``regex=False``).
        The input frame is not modified.

        :param survey_list: DataFrame with "NO.", "Street / Block Name" and "Post Code" columns
        :return: corrected DataFrame with the unwanted rows removed
        """
        street_col = "Street / Block Name"
        number_col = "NO."

        def rename_street(frame, wrong, right):
            # Literal substring replacement on the street column
            frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

        def drop_houses(frame, street, numbers):
            # Remove the given house numbers on one street and detach from the parent frame
            unwanted = (frame[street_col] == street) & (frame[number_col].isin(numbers))
            return frame[~unwanted].copy()

        # Rows without a postcode are dropped
        survey_list = survey_list[~pd.isnull(survey_list["Post Code"])].copy()

        # Split on / and take the first half
        survey_list[street_col] = survey_list[street_col].str.split("/").str[0]

        # Not in the asset list
        for street, numbers in (
            ("Horsebridge Road", [286]),
            ("DUTTON WAY", [9]),
            ("PAYTHORNE CLOSE", [10]),
            ("MARCHWOOD ROAD", [11]),
            ("Otterburn Close", [4]),
            ("Blossom Court", [5]),
        ):
            survey_list = drop_houses(survey_list, street, numbers)

        # Rename first, then remove individual houses under the new name
        for wrong, right, numbers in (
            ("St LUKES CLOSE , HUNTINGDON", "St. Lukes Close", [4, 7, 8]),
            ("ROMAN WAY , GODMANCHESTER , HUNTINGDON", "Roman Way", [58]),
            ("HEADLANDS , FENSTANTON , HUNTINGDON", "Headlands Fenstanton", [126, 134]),
        ):
            rename_street(survey_list, wrong, right)
            survey_list = drop_houses(survey_list, right, numbers)

        for wrong, right in (
            ("WALLACE COURT , HUNTINGDON", "Wallace Court"),
            ("CRICKETERS WAY , CHATTERIS", "Cricketers Way"),
            ("Jubilee Gardens", "Jubilee Green"),
        ):
            rename_street(survey_list, wrong, right)

        survey_list = drop_houses(survey_list, "Harrow Road", [10])

        # Applied after the St. Lukes Close removal above, so these rows are all kept
        rename_street(survey_list, "ST LUKES CLOSE", "St. Lukes Close")

        return survey_list

    @staticmethod
    def correct_ha49_survey_list(survey_list):
        """
        Return the HA49 survey list as-is; no corrections are applied for this association.

        :param survey_list: survey DataFrame
        :return: the object that was passed in, unchanged
        """
        return survey_list

    @staticmethod
    def correct_ha8_survey_list(survey_list):
        """
        Apply the HA8-specific corrections to the survey list.

        The street column is cut at the first "/", three street names are rewritten, and rows
        without a postcode are removed. The street corrections are written onto the frame that
        was passed in; the returned frame is the filtered result.

        :param survey_list: DataFrame with "Street / Block Name" and "Post Code" columns
        :return: DataFrame restricted to rows that have a postcode
        """
        column = "Street / Block Name"

        # Keep only the part of the street name before the first "/"
        streets = survey_list[column].str.split("/").str[0]

        for wrong, right in (
            ("WESTONIA COURT HOUSE", "Westonia Court"),
            ("Hillesdon Avenue", "Hillesden Avenue"),
            ("Weston Street", "Western Street"),
        ):
            streets = streets.str.replace(wrong, right)

        survey_list[column] = streets

        # Remove placeholder rows where postcode is missing
        has_postcode = survey_list["Post Code"].notna()
        return survey_list[has_postcode]

    @staticmethod
    def correct_ha11_survey_list(survey_list):
        """
        Drop 39 HOLLYWOOD WAY from the HA11 survey list, as it's not in the asset list.

        :param survey_list: DataFrame with "NO." and "Street / Block Name" columns
        :return: filtered DataFrame without that address
        """
        on_hollywood_way = survey_list["Street / Block Name"] == "HOLLYWOOD WAY"
        is_number_39 = survey_list["NO."] == 39
        missing_from_assets = on_hollywood_way & is_number_39

        return survey_list[~missing_from_assets]

    @staticmethod
    def correct_ha42_survey_list(survey_list):
        """
        Blank out two HA42 street names in the survey list.

        The original asset list has nothing in the street field for these, so the text is
        replaced with an empty string. The column is overwritten on the frame that was passed
        in, which is also returned.

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame with those street names removed
        """
        column = "Street / Block Name"

        streets = survey_list[column]
        for street_to_blank in ("Turnstone Terrace", "Pegasus place"):
            streets = streets.str.replace(street_to_blank, "")
        survey_list[column] = streets

        return survey_list

    @staticmethod
    def correct_ha45_survey_list(survey_list):
        """
        Rewrite "Norwich Road" as "Norwich Avenue" in the HA45 survey list.

        The column is overwritten on the frame that was passed in, which is also returned.

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame with the street name corrected
        """
        column = "Street / Block Name"
        corrected_streets = survey_list[column].str.replace("Norwich Road", "Norwich Avenue")
        survey_list[column] = corrected_streets
        return survey_list

    @staticmethod
    def correct_ha51_survey_list(survey_list):
        """
        Apply the HA51-specific corrections to the survey list.

        The house-number column, which has the header "NO " in this list, is renamed to "NO.",
        and the "Autum Close" misspelling is fixed.

        :param survey_list: DataFrame with "NO " and "Street / Block Name" columns
        :return: renamed DataFrame with the street name corrected
        """
        column = "Street / Block Name"

        corrected = survey_list.rename(columns={"NO ": "NO."})
        corrected[column] = corrected[column].str.replace("Autum Close", "Autumn Close")

        return corrected

    @staticmethod
    def correct_ha52_survey_list(survey_list):
        """
        Fix HA52 street names in the survey list and remove duplicated addresses.

        Street names are corrected on the frame that was passed in; duplicates are then dropped
        on house number, street and postcode, which yields the returned frame.

        :param survey_list: DataFrame with "NO.", "Street / Block Name" and "Post Code" columns
        :return: de-duplicated DataFrame with corrected street names
        """
        column = "Street / Block Name"

        streets = survey_list[column]
        for wrong, right in (
            ("Mardalle Avenue", "Mardale Avenue"),
            ("Ollerton  Close, Grappenhall", "Ollerton Close"),
            ("Bradshaw Road, Grappenhall", "Bradshaw Lane"),
        ):
            streets = streets.str.replace(wrong, right)
        survey_list[column] = streets

        # Drop a bunch of dupes
        address_key = ["NO.", column, "Post Code"]
        return survey_list.drop_duplicates(address_key)

    @staticmethod
    def correct_ha5_survey_list(survey_list):
        """
        Return the HA5 survey list as-is; no corrections are applied for this association.

        :param survey_list: survey DataFrame
        :return: the object that was passed in, unchanged
        """
        return survey_list

    @staticmethod
    def correct_ha20_survey_list(survey_list):
        """
        Rewrite HA20 street names in the survey list.

        Each pair is applied as a substring replacement on "Street / Block Name", in the order
        listed. The column is overwritten on the frame that was passed in, which is also returned.

        :param survey_list: DataFrame with a "Street / Block Name" column
        :return: the same DataFrame with the street names corrected
        """
        column = "Street / Block Name"

        # (street name as written in the survey list, replacement)
        street_fixes = (
            ("Abbot Close", "ABBOTS CLOSE"),
            ("Downbarns Road", "DOWN BARNS ROAD"),
            ("Austin Lane", "AUSTINS LANE"),
            ("South Park Way", "SOUTHPARK WAY"),
            ("OAKLAND ROAD", "OAKWOOD ROAD"),
            ("ACRE WAY/NORTHWOOD", "ACRE WAY"),
        )

        streets = survey_list[column]
        for wrong, right in street_fixes:
            streets = streets.str.replace(wrong, right)
        survey_list[column] = streets

        return survey_list

    @staticmethod
    def levenstein_match(matching_string, df):
        """
        Pick the row of ``df`` whose "matching_address" is the closest fuzzy match to a key.

        Punctuation and spaces are stripped from each candidate address before it is scored
        against ``matching_string`` with ``fuzz.ratio``. The highest-scoring row wins; on a tie
        the first such row is returned. No minimum score is enforced at the moment, so the best
        candidate is returned however poor the match is.

        :param matching_string: key to match, expected to already be free of spaces
        :param df: candidate rows; must contain a "matching_address" column of strings
        :return: a one-row DataFrame holding the best match, or an empty DataFrame when ``df``
            has no rows (previously this raised ``ValueError`` from ``min()`` on an empty list)
        """
        if df.empty:
            # Nothing to score; let the caller decide how to handle "no candidates"
            return df.iloc[0:0]

        # Strip out punctuation and spaces
        candidates = [
            re.sub(r'[^\w\s]', '', address).replace(" ", "")
            for address in df["matching_address"].tolist()
        ]

        # Highest ratio == smallest distance; list.index keeps the first row on ties
        scores = [fuzz.ratio(matching_string, candidate) for candidate in candidates]
        best_match_index = scores.index(max(scores))

        return df.iloc[best_match_index:best_match_index + 1]

    def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
        """
        Match every survey row to a single asset row and attach the asset row id.

        The survey list is first passed through the association-specific
        ``correct_<ha_name>_survey_list`` method. Each survey row is then narrowed down against
        ``asset_list`` in stages: street name contained in the address, house number contained
        in the address, exact "HouseNo", postcode, and finally a fuzzy match on the full key.

        All substring checks are literal (``regex=False``), so characters such as "." or "(" in
        a street name or house number are matched as written rather than as regex syntax.

        Survey rows whose postcode is in the association's "missed postcodes" are recorded
        without a match instead of raising.

        :param asset_list: DataFrame with "matching_address", "matching_postcode", "HouseNo" and
            "asset_list_row_id" columns
        :param survey_list: DataFrame with "NO.", "Street / Block Name", "Post Code" and
            "survey_list_row_id" columns, and optionally "Town/Area"
        :param ha_name: housing association code, e.g. "HA34"
        :return: the corrected survey list with "asset_list_row_id" merged on (missing where a
            row was deliberately left unmatched)
        :raises ValueError: if a row cannot be matched, if the lookup does not cover every survey
            row, or if two survey rows match the same asset row
        """

        def unmatched_entry(survey_row):
            # Lookup entry for a survey row that is knowingly absent from the asset list
            return {
                "survey_list_row_id": survey_row["survey_list_row_id"],
                "asset_list_row_id": None,
            }

        def fail_to_match(survey_row, number):
            # Print the address details for manual investigation, then abort
            print(survey_row["Street / Block Name"])
            print(number)
            print(survey_row["Post Code"])
            raise ValueError("Investigate")

        # Correct the survey list
        survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list")
        survey_list = survey_list_correction_function(survey_list)

        missed_postcodes = set()
        if ha_name in ["HA6", "HA34"]:
            # Any survey postcode that is absent from the asset list is tolerated
            known_postcodes = set(asset_list["matching_postcode"].values)
            missed_postcodes = {
                postcode.lower() for postcode in survey_list["Post Code"]
                if postcode.lower() not in known_postcodes
            }

        if ha_name == "HA13":
            missed_postcodes = {"hp17 8le"}

        if ha_name == "HA56":
            # Multiple properties are listed as blocks, which is a problem for matching
            missed_postcodes = {"sk17 6nr", "wa5 0en"}

        matching_lookup = []
        for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):

            house_number = row["NO."]
            if isinstance(house_number, str):
                house_number = house_number.lower().strip()

            # Filter on the first line of the address
            street = row["Street / Block Name"].lower().strip()
            df = asset_list[
                asset_list["matching_address"].str.contains(street, regex=False)
            ].copy()

            if not any(df["matching_address"].str.contains(str(house_number), regex=False)):
                if "flat" in str(house_number):
                    house_number = house_number.split("flat")[1].strip()

                # We check if we had an instance of flat x, y
                if "," in str(house_number):
                    house_number = house_number.split(",")[0].strip()

                # We may also have a space for an instance of flat x y
                if " " in str(house_number):
                    house_number = house_number.split(" ")[0].strip()

            df = df[df["matching_address"].str.contains(str(house_number), regex=False)]

            if df.empty:
                if row["Post Code"].lower() in missed_postcodes:
                    matching_lookup.append(unmatched_entry(row))
                    continue

                fail_to_match(row, house_number)

            if df.shape[0] != 1:
                df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
                if df.shape[0] != 1:
                    df = df[
                        df["matching_postcode"].str.lower().str.contains(
                            row["Post Code"].lower().strip(), regex=False
                        )
                    ]

                    if df.empty:
                        if row["Post Code"].lower() in missed_postcodes:
                            matching_lookup.append(unmatched_entry(row))
                            continue

                        # No candidate left: fail here with the address details rather than
                        # letting the fuzzy matcher choke on an empty candidate list
                        fail_to_match(row, house_number)

                    if df.shape[0] != 1:
                        if "Town/Area" not in row.keys():
                            full_key = (str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() +
                                        row["Post Code"].lower().strip())
                        else:
                            full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + \
                                       row["Town/Area"].lower().strip() + row["Post Code"].lower().strip()
                        # Remove any spaces from the full key
                        full_key = full_key.replace(" ", "")

                        df = self.levenstein_match(full_key, df)

                        if df.shape[0] != 1:
                            fail_to_match(row, house_number)

            matching_lookup.append(
                {
                    "survey_list_row_id": row["survey_list_row_id"],
                    "asset_list_row_id": df["asset_list_row_id"].values[0],
                }
            )

        matching_lookup = pd.DataFrame(matching_lookup)

        if matching_lookup.shape[0] != survey_list.shape[0]:
            raise ValueError("Mismatch in the number of survey rows and matching lookup rows")

        # Deliberately unmatched rows carry no asset id and are left out of the merge
        matching_lookup = matching_lookup[~pd.isnull(matching_lookup["asset_list_row_id"])]

        if matching_lookup["asset_list_row_id"].duplicated().sum():
            raise ValueError("Duplicated matches in survey list")

        # Merge onto the survey list
        survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")

        return survey_list

    @staticmethod
    def correct_ha25_eco3_list(eco3_list):
        """
        Apply the HA25-specific corrections to the ECO3 list.

        Rows with postcode "BS305DT" or with no postcode are removed, genuine duplicates are
        dropped, and street names and one postcode are corrected. The de-duplicated frame is
        copied before any column is assigned, so the corrections no longer write to a slice of
        another frame. Replacements are literal (``regex=False``), so the "." in "ST.MARYS HILL"
        is not treated as a regex wildcard. The input frame is not modified.

        :param eco3_list: DataFrame with "NO ", "Street / Block Name" and "Post Code" columns
        :return: corrected, de-duplicated DataFrame
        """
        street_col = "Street / Block Name"
        postcode_col = "Post Code"

        # NEADS DRIVE, postcode with bs305dt, is not found in the asset list
        eco3_list = eco3_list[~(eco3_list[postcode_col] == "BS305DT")]
        # Drop rows with missing postcode
        eco3_list = eco3_list[~pd.isnull(eco3_list[postcode_col])]
        # We have a bunch of genuine duplicates
        eco3_list = eco3_list.drop_duplicates(["NO ", street_col, postcode_col]).copy()

        for wrong, right in (
            ("HALWILL MEADOOW", "HALWILL MEADOW"),
            ("Hall Road", "Hall Rd"),
            ("SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY"),
            ("BOND SPEAR COURT", "BOND-SPEAR COURT"),
            ("ST.MARYS HILL", "ST MARYS HILL"),
        ):
            eco3_list[street_col] = eco3_list[street_col].str.replace(wrong, right, regex=False)

        # Correct the postcode for edmund road
        eco3_list[postcode_col] = np.where(
            (eco3_list[street_col] == "EDMUND ROAD") &
            (eco3_list[postcode_col] == "TR14 8QJ"),
            "TR15 1BY",
            eco3_list[postcode_col]
        )
        return eco3_list

    @staticmethod
    def correct_ha50_eco3_list(eco3_list):
        """
        Correction hook for the HA50 ECO3 list; no corrections are applied.

        :param eco3_list: The ECO3 list for HA50.
        :return: The same object that was passed in, untouched.
        """
        return eco3_list

    @staticmethod
    def correct_ha41_eco3_list(eco3_list):
        """
        Correction hook for the HA41 ECO3 list; no corrections are applied.

        :param eco3_list: The ECO3 list for HA41.
        :return: The same object that was passed in, untouched.
        """
        return eco3_list

    @staticmethod
    def correct_ha63_eco3_list(eco3_list):
        """
        Apply the manual data corrections needed for the HA63 ECO3 list.

        Rows with a missing postcode, or with postcode "NR32 15X" / "NR30 2BT", are removed; several
        street names are rewritten; and spaces are stripped from the house number of DENMARK STREET rows.

        :param eco3_list: DataFrame with at least the "NO ", "Street / Block Name" and "Post Code" columns.
        :return: A new, corrected DataFrame. The input frame is not modified.
        """
        eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])]
        # Some postcodes that aren't in the asset list.
        # The explicit copy makes the column assignments below write to an independent frame rather than
        # to a filtered slice (which triggers pandas' SettingWithCopyWarning).
        eco3_list = eco3_list[
            ~eco3_list["Post Code"].isin(
                ["NR32 15X", "NR30 2BT"]
            )
        ].copy()

        # (wrong, corrected) street names, applied as literal substring replacements. regex=False is
        # explicit because the default of Series.str.replace changed between pandas versions.
        # None of these touch "DENMARK STREET", so they are independent of the house number fix below.
        street_name_fixes = (
            ("POUND COTTAGES - BLOOMSBERRY CLOSE", "POUND COTTAGES"),
            ("FREDRICK ROAD", "Frederick Road"),
            ("OLD HOSPITAL MEWS HOSPITAL WALK", "Old Hospital Mews"),
            ("Portland House, Portland Street", "Portland House"),
            ("MIDDLE MARKET STREET", "Middle Market Road"),
        )
        for wrong, corrected in street_name_fixes:
            eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
                wrong, corrected, regex=False
            )

        # For denmark street, remove the space from the house number
        # NOTE(review): .str.replace yields NaN for non-string house numbers, so a numeric house number on
        # DENMARK STREET would become NaN here - confirm the column only holds strings for that street.
        eco3_list["NO "] = np.where(
            eco3_list["Street / Block Name"] == "DENMARK STREET",
            eco3_list["NO "].str.replace(" ", "", regex=False),
            eco3_list["NO "]
        )

        return eco3_list

    @staticmethod
    def correct_ha117_eco3_list(eco3_list):
        """
        Apply the manual data corrections needed for the HA117 ECO3 list.

        Rows with a missing postcode are removed and every occurrence of "TARRING ROAD" in the street
        name is rewritten to "155 TARRING ROAD".

        :param eco3_list: DataFrame with at least the "Street / Block Name" and "Post Code" columns.
        :return: A new, corrected DataFrame. The input frame is not modified.
        """
        # Delete rows where postcode is null - there are some placeholder rows where this happens.
        # The explicit copy makes the assignment below write to an independent frame rather than to a
        # filtered slice (which triggers pandas' SettingWithCopyWarning).
        eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])].copy()

        # Literal replacement; regex=False is explicit because the default changed between pandas versions.
        # NOTE(review): a street already recorded as "155 TARRING ROAD" would become "155 155 TARRING ROAD".
        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
            "TARRING ROAD", "155 TARRING ROAD", regex=False
        )

        return eco3_list

    @staticmethod
    def correct_ha56_eco3_list(eco3_list):
        """
        Apply the manual data corrections needed for the HA56 ECO3 list.

        Rows with a missing postcode are removed, two street names and one postcode are rewritten, and
        one duplicate entry (5 Mount Pleasant, CW1 3JF, marked "CANCELLED 20.5.2022") is dropped.

        :param eco3_list: DataFrame with at least the "NO ", "Street / Block Name", "Post Code" and
            "INSTALL/ CANCELLATION DATE" columns.
        :return: A new, corrected DataFrame. The input frame is not modified.
        """
        # The explicit copy makes the column assignments below write to an independent frame rather than
        # to a filtered slice (which triggers pandas' SettingWithCopyWarning).
        eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])].copy()

        # Literal replacements; regex=False is explicit because the default changed between pandas versions
        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
            "Mount Pleasant, Crewe", "Mount Pleasant", regex=False
        )

        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
            "Dutton Close", "Dutton Way", regex=False
        )

        eco3_list["Post Code"] = eco3_list["Post Code"].str.replace(
            "Ls63nl", "LS6 3NL", regex=False
        )

        # Handle a duplicate. This runs after the street rename above, so it sees "Mount Pleasant".
        is_duplicate = (
            (eco3_list["Street / Block Name"] == "Mount Pleasant") &
            (eco3_list["Post Code"] == "CW1 3JF") &
            (eco3_list["NO "] == 5) &
            (eco3_list["INSTALL/ CANCELLATION DATE"] == "CANCELLED 20.5.2022")
        )
        eco3_list = eco3_list[~is_duplicate]

        return eco3_list

    @staticmethod
    def correct_ha51_eco3_list(eco3_list):
        """
        Apply the manual data corrections needed for the HA51 ECO3 list.

        Three misspelt street names are corrected, the postcode of SYDENHAM ROAD is changed from
        "CR0 2DW" to "CR0 2ED", the entry "FLAT 3, 11" WOODLEY LANE (SM5 2RJ) is removed, and the house
        number "47 B" is normalised to "47B".

        :param eco3_list: DataFrame with at least the "NO ", "Street / Block Name" and "Post Code" columns.
        :return: A new, corrected DataFrame. The input frame is not modified.
        """
        # Work on a copy: unlike the other correction functions, nothing is filtered before the first
        # assignment here, so without it the caller's DataFrame would be modified in place.
        eco3_list = eco3_list.copy()

        # (wrong, corrected) street names, applied as literal substring replacements. regex=False is
        # explicit because the default of Series.str.replace changed between pandas versions.
        street_name_fixes = (
            ("HASELEMERE AVENUE", "HASLEMERE AVENUE"),
            ("THORVILLE GROVE", "THORNVILLE GROVE"),
            ("MONTBRETA CLOSE", "MONTBRETIA CLOSE"),
        )
        for wrong, corrected in street_name_fixes:
            eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
                wrong, corrected, regex=False
            )

        eco3_list["Post Code"] = np.where(
            (eco3_list["Street / Block Name"] == "SYDENHAM ROAD") &
            (eco3_list["Post Code"] == "CR0 2DW"),
            "CR0 2ED",
            eco3_list["Post Code"]
        )
        # Not in asset list
        eco3_list = eco3_list[
            ~((eco3_list["Street / Block Name"] == "WOODLEY LANE") &
              (eco3_list["Post Code"] == "SM5 2RJ") &
              (eco3_list["NO "] == "FLAT 3, 11"))
        ].copy()

        eco3_list["NO "] = np.where(
            (eco3_list["NO "] == "47 B"),
            "47B",
            eco3_list["NO "]
        )

        return eco3_list

    def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
        """
        Match each ECO3 row to a single asset list row and attach the matching asset_list_row_id.

        The HA-specific correction function (``correct_<ha_name lower-cased>_eco3_list``) is applied first.
        Each ECO3 row is then narrowed down against the asset list by postcode, house number and street
        name, with a final "flat" / not "flat" disambiguation when more than one candidate remains.

        Side effects visible in this method: a "matching_postcode_nospace" column is added to
        ``asset_list`` and is not removed; a temporary "matching_address_no_punctuation" column is added
        to ``asset_list`` and dropped before returning.

        :param asset_list: Asset DataFrame; the columns read here are "matching_postcode",
            "matching_address", "HouseNo" and "asset_list_row_id".
        :param eco3_list: ECO3 DataFrame; the columns read here are "Post Code", "NO ",
            "Street / Block Name" and "eco3_list_row_id".
        :param ha_name: HA identifier, used to pick the correction function and the expected number of
            unmatched rows in ``self.UNMATCHED_ECO3``.
        :return: The corrected ECO3 list with "asset_list_row_id" left-merged on (missing for unmatched rows).
        :raises ValueError: If a row matches more than one asset, if the number of unmatched rows differs
            from ``self.UNMATCHED_ECO3[ha_name]``, or if two ECO3 rows match the same asset.
        """

        # Look up the per-HA correction function by name and apply it
        eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list")
        eco3_list = eco3_list_correction_function(eco3_list)

        # Normalised postcodes (lower case, no spaces) on both sides
        # NOTE(review): "matching_postcode_nospace" is left on asset_list when this method returns -
        # confirm whether callers rely on it.
        asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower()
        eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "")

        if ha_name in ["HA25", "HA56", "HA51"]:
            # HA25: 317 -> 259
            # Drop ECO3 rows whose postcode does not appear in the asset list at all
            missed_postcodes = {
                postcode for postcode in eco3_list["postcode_no_space"] if
                postcode not in asset_list["matching_postcode_nospace"].values
            }

            eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)]

        # For the asset list, we create a matching address without any punctuation
        # TODO: We should generally just remove punctuation from addresses when matching
        asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(
            r'[^\w\s]', '', regex=True
        )
        # Remove double spaces
        asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace(
            "  ", " "
        )

        matching_lookup = []
        missed = []
        for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
            # if row["eco3_list_row_id"] == "HA51_Eco3_22":
            #     raise Exception()
            postcode = row["postcode_no_space"]

            # df will never be empty, since we've already done a check for common postcodes
            # NOTE(review): that check only runs for HA25, HA56 and HA51 above. Also note this is a
            # substring match on the postcode, not an equality match.
            df = asset_list[
                asset_list["matching_postcode_nospace"].str.contains(postcode)
            ]

            house_number = row["NO "]
            if isinstance(house_number, str):
                house_number = house_number.lower().strip()

            # Only try to simplify the house number when no asset at this postcode contains it as-is
            # NOTE(review): str.contains returns NaN for non-string HouseNo values and NaN is truthy
            # inside any(), which would skip the simplification below - confirm HouseNo is always a string.
            if not any(df["HouseNo"].str.contains(str(house_number))):
                if "flat" in str(house_number):
                    house_number = house_number.split("flat")[1].strip()

                # We check if we had an instance of flat x, y
                if "," in str(house_number):
                    house_number = house_number.split(",")[0].strip()

                # We may also have a space for an instance of flat x y
                if " " in str(house_number):
                    house_number = house_number.split(" ")[0].strip()

            # We must do the house number filter
            df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]

            # Perform a search on streetname
            # We do this to prevent duplicate matches to properties with the same postcode and house number,
            # but different streets
            # Only the part of the street before the first "/" or "," is used, with punctuation removed
            street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
            street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1)
            df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)]

            if df.empty:
                # No candidate left: record as missed and move on
                missed.append(row["eco3_list_row_id"])
                continue

            if df.shape[0] > 1:
                # Disambiguate on whether the ECO3 house number mentions a flat
                if "flat" in str(row["NO "]).lower():
                    df = df[df["matching_address"].str.contains("flat")]
                else:
                    df = df[~df["matching_address"].str.contains("flat")]

            # Anything other than exactly one candidate needs manual investigation
            if df.shape[0] != 1:
                print(row["Street / Block Name"])
                print(house_number)
                print(row["Post Code"])
                raise ValueError("Investigate")

            matching_lookup.append(
                {
                    "eco3_list_row_id": row["eco3_list_row_id"],
                    "asset_list_row_id": df["asset_list_row_id"].values[0],
                }
            )

        # We verify the missed
        # HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2
        # where many surveys were conducted on house numbers, not in the asset list
        # 154 missed, 2827 matched for HA 25
        # For HA56, the number of missed is high at 320, however a big portion of these are due to the block being
        # listed in the asset list, and individual units being in the survey list
        if len(missed) != self.UNMATCHED_ECO3[ha_name]:
            raise ValueError(
                f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
            )

        # NOTE(review): if no row matched at all, this frame has no columns and the lookup below would
        # raise a KeyError rather than a ValueError - confirm that case cannot occur.
        matching_lookup = pd.DataFrame(matching_lookup)
        # Check dupes as this will cause problems later on
        if matching_lookup["asset_list_row_id"].duplicated().sum():
            raise ValueError("Duplicated asset list row ids")

        # Merge onto eco3 list
        eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id")

        # Remove the temporary matching column from the caller's asset list
        asset_list.drop(columns=["matching_address_no_punctuation"], inplace=True)

        return eco3_list

    @staticmethod
    def extract_streetname(address, house_number=None, postcode=None):
        """
        Cleans an address by removing the house number and postcode, and converts everything to lower case.

        Only the section before the first remaining comma is returned, with runs of whitespace collapsed
        to a single space. The house number and postcode are matched as whole words, case-insensitively,
        and are treated as literal text (regex metacharacters in them are escaped).

        :param address: The full address as a string.
        :param house_number: The house number to remove, as a string or integer.
        :param postcode: The postcode to remove, as a string.
        :return: The cleaned address.
        """
        # Convert everything to lower case
        address = address.lower()

        if house_number is not None:
            # Remove the house number. It is escaped so that values such as "1.2" or "3[a" are matched
            # literally instead of being interpreted as (possibly invalid) regular expressions.
            house_number_pattern = r'\b{}\b'.format(re.escape(str(house_number)))
            address = re.sub(house_number_pattern, '', address, flags=re.IGNORECASE).strip()

        if postcode is not None:
            # Remove the postcode
            postcode_pattern = r'\b{}\b'.format(re.escape(postcode))
            address = re.sub(postcode_pattern, '', address, flags=re.IGNORECASE).strip()

        # Get first section before a comma. No comma can remain after this, so no further comma
        # clean-up is required.
        address = address.split(",")[0]
        # Replace multiple spaces with a single space
        address = re.sub(r'\s+', ' ', address)

        return address

    def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
        """
        Match each CIGA row to a single asset list row and attach the matching asset_list_row_id.

        Candidates are narrowed by postcode (searched for inside the asset "matching_address") and house
        number. When several candidates remain they are disambiguated, in order, by street name, by the
        presence of "flat" in the address, and finally by ``self.levenstein_match``.

        :param asset_list: Asset DataFrame; the columns read here are "matching_address", "HouseNo" and
            "asset_list_row_id".
        :param ciga_list: CIGA DataFrame; the columns read here are "HouseNo", "Matched Postcode",
            "Matched Address" and "ciga_list_row_id".
        :param ha_name: HA identifier, used to look up the expected number of unmatched rows in
            ``self.UNMATCHED_CIGA``.
        :return: ``ciga_list`` with "asset_list_row_id" left-merged on (missing for unmatched rows).
        :raises ValueError: If a row cannot be narrowed to exactly one asset, if the number of unmatched
            rows differs from ``self.UNMATCHED_CIGA[ha_name]``, or if two CIGA rows match the same asset.
        """
        matching_lookup = []
        unmatched_addresses = []

        for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):

            house_number = row["HouseNo"]
            if isinstance(house_number, str):
                house_number = house_number.lower().strip()

            # Filter on the postcode
            df = asset_list[
                asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
            ].copy()

            df = df[df["HouseNo"].astype(str) == str(house_number)]
            # For ciga, we skip rows without any candidate and only record them
            if df.empty:
                unmatched_addresses.append(
                    {
                        "ciga_list_row_id": row["ciga_list_row_id"],
                        "HouseNo": house_number,
                        "Matched Postcode": row["Matched Postcode"]
                    }
                )
                continue

            if df.shape[0] != 1:

                # We split house number and postcode out of the matched address for ciga
                street_name = self.extract_streetname(
                    address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
                )
                # We check if any of the rows contains the street name and if they do, filter
                contains_street = df["matching_address"].str.replace(",", "").str.contains(street_name)
                if any(contains_street):
                    df = df[contains_street]

                if df.shape[0] != 1:
                    # The final check we do here is to check for the presence of flat in the address
                    if "flat" in row["Matched Address"].lower():
                        df = df[df["matching_address"].str.contains("flat")]
                    else:
                        df = df[df["matching_address"].str.contains("flat") == False]

                    if df.shape[0] != 1:
                        full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[
                            "Matched Postcode"].lower().strip()
                        # Remove any spaces from the full key
                        full_key = full_key.replace(" ", "")
                        df = self.levenstein_match(full_key, df)

                    if df.shape[0] != 1:
                        # Report using the CIGA column names; the survey-list column names used here
                        # previously raised a KeyError that hid the ValueError below.
                        print(row["Matched Address"])
                        print(house_number)
                        print(row["Matched Postcode"].lower())
                        raise ValueError("Investigate")

            matching_lookup.append(
                {
                    "ciga_list_row_id": row["ciga_list_row_id"],
                    "asset_list_row_id": df["asset_list_row_id"].values[0],
                }
            )

        # We have an acceptable number of ciga failures for each HA
        if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
            raise ValueError(
                f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched")

        # Naming the columns keeps the frame usable (and the merge below valid) when nothing matched
        matching_lookup = pd.DataFrame(matching_lookup, columns=["ciga_list_row_id", "asset_list_row_id"])

        # Check dupes as this will cause problems later on
        if matching_lookup["asset_list_row_id"].duplicated().any():
            raise ValueError("Duplicated asset list row ids")

        # Merge onto the ciga list
        ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id")

        return ciga_list

    @staticmethod
    def identify_built_form_ha6(property_string):
        """
        Identify the built form of a property from the given string.

        Matching is case-insensitive and treats hyphens and runs of whitespace as a single space, so
        "Semi-Detached" and "semi  detached" are both recognised as 'Semi-Detached' rather than falling
        through to the 'detached' keyword.

        :param property_string: The string describing the property
        :return: The identified built form, or None if it cannot be identified (including when the
            input is not a string, e.g. a missing value)
        """
        if not isinstance(property_string, str):
            return None

        # Define keywords for each built form.
        # Order matters: 'semi detached' contains 'detached', so it has to be tested first.
        built_forms = {
            'Semi-Detached': ['semi detached'],
            'Detached': ['detached'],
            'Mid-Terrace': ['mid terrace', 'mid town house'],
            'End-Terrace': ['end terrace', 'end town house']
        }

        # Normalize the input: lower case, hyphens as spaces, whitespace collapsed
        property_string_normalized = " ".join(property_string.lower().replace("-", " ").split())

        # Return the first built form with a keyword present in the input string
        for built_form, keywords in built_forms.items():
            if any(keyword in property_string_normalized for keyword in keywords):
                return built_form

        # Return None if no built form is identified
        return None

    def load(self):
        """
        Load the December figures and the per-HA input lists, then cache the inputs in S3.

        Sets ``self.december_figures`` (from the CSV at ``self.december_figures_filepath``) and
        ``self.data``, a dict keyed by HA name holding the "asset_list", "survey_list", "ciga_list" and
        "eco3_list" returned by ``self.load_asset_list``. When ``self.use_cache`` is set and
        ``self.rebuild`` is not, previously pickled inputs are read from S3 first and only HAs missing
        from that cache are loaded. The resulting dict is always written back to S3.
        """
        cache_location = {
            "bucket_name": "retrofit-datalake-dev",
            "s3_file_name": "ha-analysis/batch3-inputs.pickle",
        }

        # The december figures are just a csv
        figures = self.december_figures = pd.read_csv(self.december_figures_filepath)
        # HA names are stored without spaces
        figures["HA Name"] = figures["HA Name"].str.replace(" ", "")
        # The count columns are nullable integers
        for count_column in ("ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"):
            figures[count_column] = figures[count_column].astype("Int64")

        # Start from the cached inputs when allowed, otherwise from scratch
        # NOTE(review): this unpickles data from S3 - only safe as long as the bucket content is trusted.
        use_cached_inputs = self.use_cache and not self.rebuild
        data = read_pickle_from_s3(**cache_location) if use_cached_inputs else {}

        for filepath in self.directories:
            # The HA name is the third "/"-separated component of the directory path
            ha_name = filepath.split("/")[2]
            if ha_name not in data:
                logger.info("Loading data for {}".format(ha_name))
                asset_list, survey_list, ciga_list, eco3_list = self.load_asset_list(
                    filepath=filepath,
                    ha_name=ha_name,
                )
                data[ha_name] = dict(
                    asset_list=asset_list,
                    survey_list=survey_list,
                    ciga_list=ciga_list,
                    eco3_list=eco3_list,
                )

        self.data = data

        # Cache the data in s3 as a pickle
        save_pickle_to_s3(data=self.data, **cache_location)

    def ha_facts_and_figures(self):
        """
        Build the per-HA facts and figures table and store it on ``self.facts_and_figures``.

        For every HA in ``self.data`` this normalises the "ECO Eligibility" column of the asset list,
        adjusts it using the CIGA list, the ECO3 list and the survey list, derives an
        "installation_status" for each survey row, and counts eligibility categories and survey statuses.
        The counts are inner-merged onto ``self.december_figures`` by "HA Name".

        Side effects: ``self.data[ha_name]["asset_list"]`` is replaced with the updated asset list and,
        when a survey list is present, ``self.data[ha_name]["survey_list"]`` with the updated survey list.

        :return: None. The result is available as ``self.facts_and_figures``.
        :raises ValueError: If merging the CIGA or ECO3 list changes the number of asset list rows.
        """

        # Maps the raw scheme labels found in the survey lists to either ECO4 or GBIS
        scheme_map = {
            "ECO4": "ECO4",
            "AFFORDABLE WARMTH": "ECO4",
            "ECO4 A/W": "ECO4",
            "ECO4 GBIS (ECO+)": "GBIS",
            "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS",
            "ECO4 AFFORDABLE WARMTH": "ECO4",
            "Affordable Warmth": "ECO4",
            "ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS",
            "ECO4 PPS": "ECO4",
            "AFFORDABLE WARMTH / REMEDIAL": "ECO4",
            "AFF0RDALE WARMTH": "ECO4",
            "ECO 4 RdSAP CL": "ECO4",
            "Affordable Warmth (R) ": "ECO4",
            "Affordable Warmth ": "ECO4",
            "ECO 4 AFFORDABLE WARMTH": "ECO4",
        }

        # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we
        # treat these as similar to subject to CIGA, and therefore unconfirmed work that could fail. There
        # are only a small volume of properties for which we see this
        # Keys are the stripped, lower-cased raw values; values are the canonical spellings
        eco_eligibility_map = {
            "not eligble": "not eligible",
            "eco 4(subject to ciga)": "eco4 (subject to ciga)",
            "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga) (subject to archetype)",
            "eco4 (subject to archetype check)": "eco4 (subject to archetype)",
            "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)",
            "eco4  (subject to ciga)": "eco4 (subject to ciga)",
            "eco4(subject to ciga)": "eco4 (subject to ciga)",
            "eco4 subject to ciga": "eco4 (subject to ciga)",
            "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)",
            "eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)",
            "eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)",
        }

        ha_facts_and_figures = []
        for ha_name, data_assets in self.data.items():
            # Work on copies so the stored inputs are only replaced explicitly at the end of the iteration
            asset_list = data_assets["asset_list"].copy()
            survey_list = data_assets["survey_list"].copy()
            ciga_list = data_assets["ciga_list"].copy()
            eco3_list = data_assets.get("eco3_list", pd.DataFrame())

            # Used to detect merges that accidentally duplicate asset rows
            asset_list_starting_size = asset_list.shape[0]

            # Change the column name if it's ECO eligibility
            asset_list = asset_list.rename(
                columns={
                    "ECO eligibility": "ECO Eligibility",
                    "ECO Eligibilty": "ECO Eligibility",
                },
            )
            # Remove surplus whitespace from the ECO Eligibility column
            asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip()
            # Push to lower case
            asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower()
            # Remap
            asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].replace(eco_eligibility_map)

            if not ciga_list.empty:
                # We merge on ciga and update the status to reflect if it has failed ciga or not
                # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA
                # check

                # Only CIGA rows that were matched to an asset are merged
                ciga_list_to_merge = ciga_list[["asset_list_row_id", "Guarantee"]].copy()
                ciga_list_to_merge = ciga_list_to_merge[~pd.isnull(ciga_list_to_merge["asset_list_row_id"])]

                asset_list = asset_list.merge(ciga_list_to_merge, how='left', on="asset_list_row_id")

                asset_list["ECO Eligibility"] = np.where(
                    (
                        asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) &
                        (asset_list["Guarantee"] == "Yes")
                    ),
                    "failed ciga",
                    asset_list["ECO Eligibility"]
                )

                # We replace any remaining "Subject to CIGA" with pass Ciga
                # (only where the CIGA list explicitly says there is no guarantee)
                asset_list["ECO Eligibility"] = np.where(
                    (
                        asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) &
                        (asset_list["Guarantee"] == "No")
                    ),
                    "eco4 - passed ciga",
                    asset_list["ECO Eligibility"]
                )

                asset_list = asset_list.drop(columns=["Guarantee"])

            # Update the asset list with the categorisations and rename changes
            if asset_list.shape[0] != asset_list_starting_size:
                raise ValueError("The asset list has changed in size")

            # If we have eco3 surveys, we set a property to not eligible
            if not eco3_list.empty:
                # NOTE(review): unlike the ciga and survey merges, rows with a missing asset_list_row_id
                # are not filtered out before this merge - confirm that is intended.
                eco3_list_to_merge = eco3_list[["asset_list_row_id"]].copy()
                eco3_list_to_merge["has_eco3"] = True
                asset_list = asset_list.merge(
                    eco3_list_to_merge, how="left", on="asset_list_row_id"
                )

                if asset_list.shape[0] != asset_list_starting_size:
                    raise ValueError("The asset list has changed in size, when merging on eco3")

                # Any rows that have an eco3 survey are set to not eligible
                asset_list["ECO Eligibility"] = np.where(
                    asset_list["has_eco3"] == True,
                    "not eligible",
                    asset_list["ECO Eligibility"]
                )
                # The "has_eco3" column is kept on the asset list, since the drop below is disabled
                # asset_list = asset_list.drop(columns=["has_eco3"])

            # Report on sales
            sales_report = {}
            if not survey_list.empty:
                # The scheme is taken from the first column of the survey list
                scheme_column = survey_list.columns[0]
                # Remap the values in the scheme column
                survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)
                # We clean up the survey list installation or cancelled
                if "INSTALLED OR CANCELLED" in survey_list.columns:
                    survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
                    # Remove all punctuation
                    survey_list["installed_or_cancelled_clean"] = survey_list[
                        "installed_or_cancelled_clean"].str.replace(
                        r'[^\w\s]', '', regex=True
                    )
                    # Remove double spaces
                    survey_list["installed_or_cancelled_clean"] = survey_list[
                        "installed_or_cancelled_clean"].str.replace(
                        r'\s+', ' ', regex=True
                    )
                    # Remove trailing spaces
                    survey_list["installed_or_cancelled_clean"] = survey_list[
                        "installed_or_cancelled_clean"].str.strip()

                    # Each np.where below overrides the status only for the rows it matches, so later
                    # rules win over earlier ones
                    survey_list["installation_status"] = None
                    survey_list["installation_status"] = np.where(
                        survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
                        "installed",
                        survey_list["installation_status"]
                    )
                    survey_list["installation_status"] = np.where(
                        survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
                        "cancelled",
                        survey_list["installation_status"]
                    )
                    # Find partial installations
                    # NOTE(review): str.contains returns NaN for missing values; confirm how np.where
                    # treats those rows (NaN is truthy when cast to bool).
                    survey_list["installation_status"] = np.where(
                        survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
                        "in progress",
                        survey_list["installation_status"]
                    )
                    # Find partial cancellations
                    # TODO: We might have more indications of partial cancellations
                    survey_list["installation_status"] = np.where(
                        survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
                        "cancelled",
                        survey_list["installation_status"]
                    )
                else:
                    # We have some examples, e.g. HA28, where we do not have the installed or cancelled column
                    # The status is then derived from the date column, which exists under two spellings
                    # NOTE(review): rows with a missing or non-string value give NaN from str.contains;
                    # confirm whether np.where then labels them "cancelled" (NaN is truthy when cast to
                    # bool), in which case the "in progress" fallback below would never apply here.
                    if 'INSTALL/ CANCELLATION DATE' in survey_list.columns:
                        survey_list["installation_status"] = np.where(
                            survey_list['INSTALL/ CANCELLATION DATE'].str.lower().str.contains("cancelled"),
                            "cancelled",
                            "installed",
                        )
                    else:
                        survey_list["installation_status"] = np.where(
                            survey_list['INSTALL / CANCELLATION DATE'].str.lower().str.contains("cancelled"),
                            "cancelled",
                            "installed",
                        )

                # Finally, for other cases, we set the status to "in progress"
                survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")

                # We concatenate the scheme name with the installation status
                # e.g. "ECO4 - installed"
                survey_list["installation_status"] = (
                    survey_list[scheme_column] + " - " + survey_list["installation_status"]
                )

                # We get the sales
                sales_report = {
                    "ECO4 - surveys sold": survey_list.shape[0],
                    **survey_list["installation_status"].value_counts().to_dict()
                }

                # We find some cases where properties have sold but are missing CIGA checks
                # Only survey rows that were matched to an asset are merged
                survey_list_to_merge = survey_list[["asset_list_row_id", "installation_status"]].copy()
                survey_list_to_merge["has_a_survey_record"] = True
                survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])]

                asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id")
                # Update the cases where properties have sold, but are missing a CIGA check
                # If we don't have a CIGA list, we set the value to ECO4
                set_to = "eco4 - passed ciga" if not ciga_list.empty else "eco4"
                asset_list["ECO Eligibility"] = np.where(
                    (asset_list["ECO Eligibility"].str.contains("subject to ciga")) & (
                        asset_list["has_a_survey_record"] == True
                    ),
                    set_to,
                    asset_list["ECO Eligibility"]
                )
                # Update the cases where a property has been marked as eligible for GBIS, but sold for ECO4
                asset_list["ECO Eligibility"] = np.where(
                    (asset_list["ECO Eligibility"] == "gbis") & (
                        asset_list["installation_status"].isin(
                            ["ECO4 - installed", "ECO4 - cancelled", "ECO4 - in progress"]
                        )
                    ),
                    "eco4",
                    asset_list["ECO Eligibility"]
                )
                # Update the cases where a property was marked as eligible for ECO4, but sold for GBIS
                asset_list["ECO Eligibility"] = np.where(
                    (asset_list["ECO Eligibility"].isin(
                        [
                            "eco4",
                            "eco4 (subject to ciga)",
                            "eco4 - passed ciga",
                            "failed ciga",
                            "eco4 (subject to archetype)",
                            "eco4 (subject to ciga) (subject to archetype)"
                        ]
                    )) & (
                        asset_list["installation_status"].isin(
                            ["GBIS - installed", "GBIS - cancelled", "GBIS - in progress"]
                        )
                    ),
                    "gbis",
                    asset_list["ECO Eligibility"]
                )
                # Update the cases where a property is marked as not eligible, but sold for GBIS
                asset_list["ECO Eligibility"] = np.where(
                    (asset_list["ECO Eligibility"] == "not eligible") & (
                        asset_list["installation_status"].isin(
                            ["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"]
                        )),
                    "gbis",
                    asset_list["ECO Eligibility"]
                )

                # Update the cases where a property is marked as not eligible, but sold for ECO4
                asset_list["ECO Eligibility"] = np.where(
                    (asset_list["ECO Eligibility"] == "not eligible") & (
                        asset_list["installation_status"].isin(
                            ["ECO4 - in progress", "ECO4 - installed", "ECO4 - cancelled"]
                        )
                    ),
                    "eco4",
                    asset_list["ECO Eligibility"]
                )

                # The helper columns from the survey merge are no longer needed on the asset list
                asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"])

                # Update the survey list with installation status
                self.data[ha_name]["survey_list"] = survey_list

            # Insert updated asset list
            self.data[ha_name]["asset_list"] = asset_list

            # One row per HA: eligibility category counts plus the survey status counts
            ha_facts_and_figures.append(
                {
                    "HA Name": ha_name,
                    **asset_list["ECO Eligibility"].value_counts().to_dict(),
                    **sales_report
                }
            )

        ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures)
        # The "not eligible" count is not reported
        # NOTE(review): this drop would raise a KeyError if no HA has any "not eligible" asset - confirm
        # that cannot happen.
        ha_facts_and_figures = ha_facts_and_figures.drop(
            columns=["not eligible"]
        )

        # Categories that do not occur for an HA count as zero
        ha_facts_and_figures = ha_facts_and_figures.fillna(0)
        # Make all columns apart from HA Name integers
        for col in ha_facts_and_figures.columns[1:]:
            ha_facts_and_figures[col] = ha_facts_and_figures[col].astype(int)

        # Inner merge: only HAs present in both the december figures and the computed figures are kept
        ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name")
        ha_facts_and_figures = ha_facts_and_figures.fillna(0)

        self.facts_and_figures = ha_facts_and_figures


# Housing associations whose property type is obtained by stripping one asset-list column and looking the result up
# with ``.get`` in ``PROPERTY_TYPE_LOOKUP[ha_name]`` (unmapped values become None). None of them supply a built form.
_STRIPPED_LOOKUP_COLUMNS = {
    "HA2": "Dwelling Type",
    "HA5": "Asset Type",
    "HA8": "Property Type",
    "HA11": "Property Type",
    "HA12": "Asset_Type1",
    "HA13": "Type Cd",
    "HA15": "Property Type",
    "HA18": "Asset Type",
    "HA20": "Asset Type",
    "HA24": "Property Type",
    "HA32": "Dwelling type",
    "HA34": "Property Type",
    "HA35": "Property Type Grouping",
    "HA37": "PROPERTY TYPE",
    "HA41": "Archetype",
    "HA42": "Dwelling use/type",
    "HA45": "Property type",
    "HA48": "Property Type",
    "HA50": "Property Type",
    "HA51": "Asset Type",
    "HA52": "Property Type",
    "HA56": "Dwelling Type Description",
    "HA63": "PropertyType",
    "HA117": "Property Type",
    "HAXXX": "Unit Description",
}

# Housing associations whose (unstripped) column value must be a key of ``PROPERTY_TYPE_LOOKUP[ha_name]``: an
# unmapped value raises KeyError instead of yielding None.
_STRICT_LOOKUP_COLUMNS = {
    "HA25": "T1_AssetType",
    "HA28": "Property Type - Academy",
    "HA30": "A_AssetType",
}

# Housing associations whose column value is returned untouched as the property type.
_PASSTHROUGH_COLUMNS = {
    "HA17": "property_type",
    "HA19": "Dwelling Type",
    "HA27": "Property Type",
    "HA54": "Property Type",
}

# Housing associations whose property type is deduced from keywords in a free-text description column.
_KEYWORD_COLUMNS = {
    "HA9": "Asset Type",
    "HA21": "Property Type",
    "HA31": "A_AssetType",
}

# Checked in order: the first keyword found in the description wins.
_PROPERTY_TYPE_KEYWORDS = (
    ("house", "House"),
    ("flat", "Flat"),
    ("bungalow", "Bungalow"),
    ("maisonette", "Maisonette"),
)


def _property_type_from_description(description):
    """Return the property type named by the first known keyword in ``description``, or None if there is none."""
    normalised = description.strip().lower()
    for keyword, property_type in _PROPERTY_TYPE_KEYWORDS:
        if keyword in normalised:
            return property_type
    return None


def get_property_type_and_built_form(property_meta, ha_name):
    """
    Derive the property type and built form of one asset-list row, using the rules of its housing association.

    Every housing association names and encodes its columns differently, so the mapping is selected by ``ha_name``.
    Most associations only provide a property type, in which case the built form is None.

    :param property_meta: A single asset-list row, indexable by column name (a pandas Series row or a dict).
    :param ha_name: Identifier of the housing association the row belongs to, e.g. "HA1".
    :return: A ``(property_type, built_form)`` tuple; either element may be None.
    :raises NotImplementedError: If no mapping has been written for ``ha_name``.
    :raises KeyError: If an expected column is missing, or a value is unmapped for an association whose lookup is
        strict.
    """
    # HA44 is deliberately left without a property type or built form
    if ha_name == "HA44":
        return None, None

    # --- Table-driven associations -----------------------------------------------------------------------------
    if ha_name in _KEYWORD_COLUMNS:
        return _property_type_from_description(property_meta[_KEYWORD_COLUMNS[ha_name]]), None

    if ha_name in _PASSTHROUGH_COLUMNS:
        return property_meta[_PASSTHROUGH_COLUMNS[ha_name]], None

    if ha_name in _STRICT_LOOKUP_COLUMNS:
        return PROPERTY_TYPE_LOOKUP[ha_name][property_meta[_STRICT_LOOKUP_COLUMNS[ha_name]]], None

    if ha_name in _STRIPPED_LOOKUP_COLUMNS:
        # HA52 has rows with no property type at all; these cannot be stripped, so bail out early
        if ha_name == "HA52" and property_meta["Property Type"] is None:
            return None, None

        lookup = PROPERTY_TYPE_LOOKUP[ha_name]
        return lookup.get(property_meta[_STRIPPED_LOOKUP_COLUMNS[ha_name]].strip()), None

    # --- Associations needing bespoke handling -----------------------------------------------------------------
    if ha_name == "HA1":
        property_type = property_meta["Asset Type"]
        # We correct a small error
        if property_type == "a":
            property_type = "House"

        # Remap bedsits to flats
        if property_type in ["Bedsit", "Room"]:
            property_type = "Flat"

        built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"], None)
        return property_type, built_form

    if ha_name == "HA6":
        property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]]
        return property_type, property_meta["built_form"]

    if ha_name == "HA7":
        property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["Archetype"])
        built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"])
        return property_type, built_form

    if ha_name == "HA14":
        if property_meta["Asset Type Description"] == "Block - Repair":
            # The description is uninformative, so deduce the type from the address: an address mentioning "room"
            # is treated as a house, anything else as a flat
            property_type = "House" if "room" in property_meta["Address 1"].lower() else "Flat"
        else:
            property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][
                property_meta["Asset Type Description"]
            ]
        return property_type, None

    if ha_name == "HA16":
        config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]]
        return config.get("property-type"), config.get("built-form")

    if ha_name == "HA39":
        property_type_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {})
        property_type = property_type_config.get("property_type", None)
        built_form = property_type_config.get("built_form", None)

        if property_type is None:
            # No usable construction style: fall back to the address, treating anything not mentioning "flat" as
            # a house
            property_type = "Flat" if "flat" in property_meta["matching_address"] else "House"
        return property_type, built_form

    if ha_name == "HA49":
        return property_meta["Property Class"].strip(), None

    if ha_name == "HA107":
        return property_meta.get("property_type", None), property_meta.get("built_form", None)

    if ha_name == "HAXX":
        # Only the text before the first colon is used as the property type
        return property_meta["Property Type"].split(":")[0].strip(), None

    raise NotImplementedError(
        f"Implement me: no property type mapping for housing association {ha_name!r}"
    )


def _classify_post_install_sap(current_sap, post_install_sap):
    """
    Grade how comfortably a predicted post-install SAP score clears the upgrade target.

    Properties currently rated F or G (SAP <= 38) are graded against a target score of 55; every other property is
    graded against a target of 69. Two or more points above the target is "highest confidence", at or above the
    target is "high confidence", up to two points below is "medium confidence", and anything else is "unlikely".
    A NaN prediction fails every comparison and is therefore also "unlikely".

    :param current_sap: The SAP score of the EPC the eligibility decision was based on.
    :param post_install_sap: The predicted SAP score after installation.
    :return: One of "highest confidence", "high confidence", "medium confidence" or "unlikely".
    """
    target_sap = 55 if current_sap <= 38 else 69

    if post_install_sap >= target_sap + 2:
        return "highest confidence"
    if post_install_sap >= target_sap:
        return "high confidence"
    if post_install_sap >= target_sap - 2:
        return "medium confidence"
    return "unlikely"


def get_epc_data(
    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
):
    """
    Build (or reload) the EPC-based eligibility results for every housing association held by ``loader``.

    With ``pull_data=True`` each asset-list row is matched to its EPC records, assessed for GBIS / ECO4 eligibility
    and, when ECO4-eligible, prepared for the SAP-change model. The model predictions are turned into a post-install
    SAP and a confidence classification, and the per-association results are pickled to S3. With ``pull_data=False``
    the previously pickled results are read back from S3 instead and nothing is recomputed.

    :param loader: Object whose ``data`` attribute maps each HA name to a dict holding an "asset_list" DataFrame
        and a "survey_list" (None when no surveys have been completed yet).
    :param cleaned: Passed through to ``Eligibility``, ``calculate_cavity_age`` and ``prepare_model_data_row``.
    :param cleaning_data: Passed through to ``prepare_model_data_row``.
    :param created_at: Passed to ``prepare_model_data_row`` and used as the ``ModelApi`` timestamp.
    :param photo_supply_lookup: Passed through to ``prepare_model_data_row``.
    :param floor_area_decile_thresholds: Passed through to ``prepare_model_data_row``.
    :param pull_data: When False, load the stored results from S3 rather than rebuilding them.
    :return: Dict mapping each HA name to ``{"results_df": ..., "scoring_df": ..., "nodata": ...}``.
    :raises ValueError: If ``loader.data`` is empty, or if a merge unexpectedly changes the number of rows.
    """
    if not loader.data:
        raise ValueError("Data not found - please run loader.load() first")

    outputs = {}
    for ha_name, data_assets in loader.data.items():

        if not pull_data:
            # Retrieve the previously processed results from S3 instead of rebuilding them
            processed_ha_results = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
            )

            outputs[ha_name] = {
                "results_df": processed_ha_results["results_df"],
                "scoring_df": processed_ha_results["scoring_df"],
                "nodata": processed_ha_results["nodata"]
            }
            continue

        # For each HA, we pull in the data required, and store the outcome in S3
        asset_list = data_assets["asset_list"].copy()

        # If the survey list is missing, we have not yet completed any surveys and therefore should only consider
        # the most recent EPC
        consider_penultimate_epc = data_assets["survey_list"] is not None

        # We iterate through the asset list and pull what we need
        results = []
        scoring_data = []
        nodata = []
        failed_model_rows = []
        for _, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):

            # Without a postcode there is nothing to search on
            if property_meta["matching_postcode"] is None:
                continue

            property_type, built_form = get_property_type_and_built_form(
                property_meta=property_meta, ha_name=ha_name
            )

            searcher = SearchEpc(
                address1=str(property_meta["HouseNo"]),
                postcode=property_meta["matching_postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                full_address=property_meta["matching_address"]
            )
            # The Ordnance Survey lookup is skipped, so we hand the searcher what the asset list tells us instead
            searcher.ordnance_survey_client.property_type = property_type
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                nodata.append(property_meta)
                continue

            if searcher.newest_epc.get("estimated"):
                # We insert the row ID as our proxy for UPRN
                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])

            newest_epc = searcher.newest_epc
            older_epcs = searcher.older_epcs
            full_sap_epc = searcher.full_sap_epc

            # If we have a survey list, we check the penultimate EPC, because the property might have been installed
            penultimate_epc = newest_epc
            if consider_penultimate_epc:
                penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
                if not penultimate_epc:
                    penultimate_epc = newest_epc

            eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()

            identified_for_gbis = property_meta["ECO Eligibility"] == "gbis"
            identified_for_eco4 = property_meta["ECO Eligibility"] == "eco4"

            # Both results are mappings (they are subscripted with "eligible" / "message" / "strict" below), so the
            # flag has to be read out explicitly: the truthiness of the mapping itself says nothing about eligibility
            gbis_eligible = eligibility.gbis_warmfront["eligible"]
            eco4_eligible = eligibility.eco4_warmfront["eligible"]

            # Identified for GBIS, but the newest EPC supports neither GBIS nor ECO4
            gbis_not_confirmed = identified_for_gbis and not gbis_eligible and not eco4_eligible
            # Identified for ECO4, but the newest EPC does not support ECO4
            eco4_not_confirmed = identified_for_eco4 and not eco4_eligible

            if consider_penultimate_epc and (gbis_not_confirmed or eco4_not_confirmed):
                # The newest EPC may post-date an installation, so we assess the penultimate EPC instead
                eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
                eligibility.check_gbis_warmfront()
                eligibility.check_eco4_warmfront()
                # The penultimate EPC is now the one being assessed, so it must not also appear among the other
                # EPCs. Estimated EPCs are left as they are to keep the data cleaning simple.
                if penultimate_epc.get("estimated") is None:
                    older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]
            # Every other combination keeps the assessment of the newest EPC: an identification confirmed by the
            # newest EPC, nothing identified, a CIGA-dependent identification, or a property that was not identified
            # but looks eligible.

            # If the property was identified for ECO4 but still does not qualify, we estimate the age of the cavity
            # fill so that particularly old cavities can be flagged
            cavity_age = None
            if identified_for_eco4 and not eligibility.eco4_warmfront["eligible"]:
                cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)

            if eligibility.eco4_warmfront["eligible"]:
                if eligibility.epc["uprn"] == "":
                    # No UPRN on the EPC: fall back to the row ID as a proxy
                    eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
                try:
                    scoring_dictionary = prepare_model_data_row(
                        property_id=property_meta["asset_list_row_id"],
                        modelling_epc=eligibility.epc,
                        cleaned=cleaned,
                        cleaning_data=cleaning_data,
                        created_at=created_at,
                        old_data=older_epcs,
                        full_sap_epc=full_sap_epc,
                        photo_supply_lookup=photo_supply_lookup,
                        floor_area_decile_thresholds=floor_area_decile_thresholds
                    )
                    scoring_data.extend(scoring_dictionary)
                except Exception as e:
                    # Scoring is best effort: record the failure, make it visible and carry on with the other rows
                    failed_model_rows.append(property_meta["asset_list_row_id"])
                    logger.warning(
                        f"{ha_name}: failed to prepare model data for row "
                        f"{property_meta['asset_list_row_id']}: {e!r}"
                    )

            results.append(
                {
                    "row_id": property_meta["asset_list_row_id"],
                    "uprn": eligibility.epc["uprn"],
                    "is_estimated": searcher.newest_epc.get("estimated") is not None,
                    "property_type": eligibility.epc["property-type"],
                    "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                    "eco4_message": eligibility.eco4_warmfront["message"],
                    "eco4_strict": eligibility.eco4_warmfront["strict"],
                    "gbis_eligible": eligibility.gbis_warmfront["eligible"],
                    "gbis_message": eligibility.gbis_warmfront["message"],
                    "gbis_strict": eligibility.gbis_warmfront["strict"],
                    "sap": float(eligibility.epc["current-energy-efficiency"]),
                    # Property components
                    "roof": eligibility.roof["clean_description"],
                    "walls": eligibility.walls["clean_description"],
                    "cavity_type": eligibility.cavity["type"],
                    "heating": eligibility.epc["mainheat-description"],
                    "tenure": eligibility.tenure,
                    "date_epc": eligibility.epc["lodgement-date"],
                    "loft_thickness": eligibility.roof["insulation_thickness"],
                    "cavity_age": cavity_age,
                    "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"],
                    "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"]
                }
            )

        results_df = pd.DataFrame(results)
        scoring_df = pd.DataFrame(scoring_data)
        results_df["post_install_sap"] = None
        results_df["eligibility_classification"] = None

        if not scoring_df.empty:
            scoring_df = scoring_df.drop(
                columns=[
                    "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
                    "carbon_ending"
                ]
            )

            model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
            model_api.MODEL_PREFIXES = ["sap_change_predictions"]

            scoring_df["id"] = scoring_df["id"] + "phase=0"
            # We split up the scoring_df and score it in chunks of 400 rows
            predictions = []
            chunk_starts = range(0, scoring_df.shape[0], 400)
            for chunk_start in tqdm(chunk_starts, total=len(chunk_starts)):
                predictions_dict = model_api.predict_all(
                    df=scoring_df.iloc[chunk_start:chunk_start + 400],
                    bucket="retrofit-data-dev",
                    prediction_buckets={
                        "sap_change_predictions": "retrofit-sap-predictions-dev",
                    }
                )

                predictions.append(predictions_dict["sap_change_predictions"])

            predictions = pd.concat(predictions)
            predictions_size = predictions.shape[0]

            predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
                results_df[["row_id", "sap"]], how="left", on="row_id"
            )
            # A left merge only grows when row_id is duplicated in the results, which would corrupt the uplift
            if predictions.shape[0] != predictions_size:
                raise ValueError("Predictions size has changed")
            predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]

            results_df = results_df.merge(
                predictions[["sap_uplift", "row_id"]],
                how="left",
                on="row_id"
            )
            results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]

            # The upgrade requirements are dependent on the current SAP, so each ECO4-eligible row is graded
            # against the target that applies to it
            eco4_eligible_rows = results_df[results_df["eco4_eligible"] == True]
            eligibility_assessment = pd.DataFrame(
                [
                    {
                        "row_id": row["row_id"],
                        "eligibility_classification": _classify_post_install_sap(
                            row["sap"], row["post_install_sap"]
                        )
                    }
                    for _, row in eco4_eligible_rows.iterrows()
                ]
            )

            # NOTE(review): results_df already carries an all-None "eligibility_classification" column (created
            # above), so pandas suffixes the two overlapping columns in this merge ("_x" / "_y") instead of filling
            # the existing one. Left unchanged because the consumers of the stored results are outside this
            # function - confirm which column they read before renaming anything.
            results_df = results_df.merge(
                eligibility_assessment, how="left", on="row_id"
            )
            # Make sure the results haven't changed in size
            if results_df.shape[0] != len(results):
                raise ValueError("results has changed size")

        # We store the results in S3 as a pickle
        save_pickle_to_s3(
            data={
                "results_df": results_df,
                "scoring_df": scoring_df,
                "nodata": nodata
            },
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
        )

        outputs[ha_name] = {
            "results_df": results_df,
            "scoring_df": scoring_df,
            "nodata": nodata
        }

    return outputs


def get_col_widths(dataframe):
    """
    Compute a display width for every column of ``dataframe``, e.g. for sizing spreadsheet columns.

    A column's width is the longest of its stringified values and its header plus two characters of padding,
    capped at 25 characters. For MultiIndex columns the header length is that of the longest level label.
    Columns are read by position, so duplicated column labels are measured individually, and a column with no
    rows is sized from its header alone.

    :param dataframe: The DataFrame whose columns should be measured.
    :return: A list of integer widths, one per column, in column order.
    """
    # Define a maximum width for any column to prevent excessively wide columns
    max_allowed_width = 25
    header_padding = 2

    has_multiindex_columns = isinstance(dataframe.columns, pd.MultiIndex)

    widths = []
    for position, column in enumerate(dataframe.columns):
        if has_multiindex_columns:
            # Each column label is a tuple of level labels; the widest one determines the header width
            header_width = max(len(str(level)) for level in column) + header_padding
        else:
            header_width = len(str(column)) + header_padding

        # Positional access always yields a single Series, even when the label is not unique
        column_values = dataframe.iloc[:, position]
        if len(column_values):
            data_width = int(column_values.astype(str).apply(len).max())
        else:
            # max() of an empty Series is NaN, which would otherwise leak into the result
            data_width = 0

        widths.append(min(max(data_width, header_width), max_allowed_width))

    return widths


# def analyse_ha_data(outputs, loader):
#     """
#     The approach we take within this function is the following:
#     For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
#     characterisation can be broken down as the following:
#     1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria
#     2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to
#     a CIGA check
#     3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft
#     insulation
#     4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under
#     any circumstances, given the available data
#
#     Then, for any property that has NOT been identified by Warmfront, we identify properties that look like they would
#     qualify under the strictest criteria, and mark these as potential additional opportunities.
#
#     :return:
#     """
#
#     eco4_rate = 1710
#     gbis_rate = 600
#     # old_eco4_rate = 1456
#     old_gbis_rate = 432
#
#     epc_c_threshold = 80
#     scheme_map = {
#         "ECO4": "ECO4",
#         "AFFORDABLE WARMTH": "ECO4",
#         "ECO4 A/W": "ECO4",
#         "ECO4 GBIS (ECO+)": "GBIS"
#     }
#
#     ha_analysis_results = []
#     total_revenue_results = []
#     for ha_name, datasets in outputs.items():
#         inputs = [x for k, x in loader.data.items() if k == ha_name][0]
#
#         results_df = datasets["results_df"].copy()
#
#         analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename(
#             columns={"row_meaning": "asset_identification_status"}
#         ).merge(
#             results_df,
#             how="left",
#             right_on="row_id",
#             left_on="asset_list_row_id"
#         )
#
#         analysis_data["is_remaining"] = True
#
#         n_sold_eco4 = 0
#         n_sold_gbis = 0
#         if not inputs["survey_list"].empty:
#             # Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had
#             # a survey)
#             survey_list = inputs["survey_list"].copy()
#
#             # TODO: TEMP
#             scheme_column = survey_list.columns[0]
#             # We clean up the survey list installation or cancelled
#             survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
#             # Remove all punctuation
#             survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
#                 r'[^\w\s]', '', regex=True
#             )
#             # Remove double spaces
#             survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
#                 r'\s+', ' ', regex=True
#             )
#             # Remove trailing spaces
#             survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip()
#
#             # Remap the values in the scheme column
#             survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)
#
#             survey_list["installation_status"] = None
#             survey_list["installation_status"] = np.where(
#                 survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
#                 "installed",
#                 survey_list["installation_status"]
#             )
#             survey_list["installation_status"] = np.where(
#                 survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
#                 "cancelled",
#                 survey_list["installation_status"]
#             )
#             # Find partial installations
#             survey_list["installation_status"] = np.where(
#                 survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
#                 "partially installed",
#                 survey_list["installation_status"]
#             )
#             # Find partial cancellations
#             # TODO: We might have more indications of partial cancellations
#             survey_list["installation_status"] = np.where(
#                 survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
#                 "partially cancelled",
#                 survey_list["installation_status"]
#             )
#
#             # Finally, for other cases, we set the status to "in progress"
#             survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")
#
#             # We concatenate the scheme name with the installation status
#             survey_list["installation_status"] = (
#                 survey_list[scheme_column] + " - " + survey_list["installation_status"]
#             )
#
#             # TODO: END TEMP
#
#             survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy()
#             survey_list_to_merge["is_remaining"] = False
#             analysis_data = analysis_data.drop(columns="is_remaining").merge(
#                 survey_list_to_merge,
#                 how="left", on="asset_list_row_id"
#             )
#             analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True)
#
#             n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0]
#             n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0]
#
#         # Take just remaining
#         analysis_data = analysis_data[analysis_data["is_remaining"]]
#
#         # Also, if the HA has started selling, we remove any that are still subject to ciga
#         n_eco4_missed_subject_to_ciga = 0
#         if not inputs["survey_list"].empty:
#             n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum()
#             analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"]
#
#         ################################################################################################
#         # We take the properties that strictly qualified under eco
#         ################################################################################################
#
#         eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy()
#         eco4_identified["identification_type"] = None
#         eco4_identified["identification_type"] = np.where(
#             (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True),
#             "strict",
#             eco4_identified["identification_type"]
#         )
#
#         # For expansive, the property can be no higher than an EPC C
#         eco4_identified["identification_type"] = np.where(
#             (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & (
#                 eco4_identified["sap"] <= epc_c_threshold
#             ),
#             "expansive",
#             eco4_identified["identification_type"]
#         )
#         ################################################################################################
#         # We take the properties dependent on CIGA
#         ################################################################################################
#
#         ciga_dependent_identified = analysis_data[
#             analysis_data["ECO Eligibility"].isin(
#                 [
#                     "eco4 (subject to ciga)",
#                     "eco4 - passed ciga"
#                 ]
#             )
#         ].copy()
#
#         # These are properties that show filled cavity
#         ciga_dependent_identified["identification_type"] = None
#         ciga_dependent_identified["identification_type"] = np.where(
#             ciga_dependent_identified["eco4_message"].isin(
#                 [
#                     "Perfect suitability",
#                     "Meets cavity and sap",
#                     "Fails cavity, meets loft, fails SAP",
#                     "Meets fabric, fails SAP check",
#                     "Meets cavity, loft borderline, meets sap",
#                 ]
#             ) & (ciga_dependent_identified["sap"] <= epc_c_threshold),
#             "strict",
#             ciga_dependent_identified["identification_type"]
#         )
#
#         ciga_dependent_identified["identification_type"] = np.where(
#             ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | (
#                 ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])
#             )) & (
#                 (ciga_dependent_identified["sap"] <= epc_c_threshold) &
#                 pd.isnull(ciga_dependent_identified["identification_type"])
#             ),
#             "expansive",
#             ciga_dependent_identified["identification_type"]
#         )
#
#         ################################################################################################
#         # We properties that qualified for gbis
#         ################################################################################################
#         gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy()
#         gbis_identified["identification_type"] = None
#         gbis_identified["identification_type"] = np.where(
#             (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69),
#             "strict",
#             gbis_identified["identification_type"]
#         )
#
#         gbis_identified["identification_type"] = np.where(
#             (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & (
#                 pd.isnull(gbis_identified["identification_type"])
#             ),
#             "expansive",
#             gbis_identified["identification_type"]
#         )
#
#         # Finally, we look at the properties that have not been identified by Warmfront
#         not_identified = analysis_data[
#             analysis_data["ECO Eligibility"].isin(
#                 [
#                     "not eligible"
#                 ]
#             )
#         ].copy()
#
#         surplus_eco4 = not_identified[
#             (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin(
#                 ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"]
#             ))
#             ]
#
#         surplus_gbis = not_identified[
#             (not_identified["gbis_eligible"] == True) & (
#                 ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values)
#             ) & (not_identified["sap"] < 69) & (
#                 (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | (
#                 not_identified["walls"].str.contains("partial", case=False, na=False)
#             )
#             )
#             ]
#         surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False]
#
#         # Output variables - the data was sent to us in December, but the remaining figures are
#         # what was in November
#         november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name]
#
#         # ECO4
#         n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0]
#         november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0)
#         november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0]
#         eco4_sales_since_november = n_sold_eco4 - november_eco4_sold
#
#         n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0]
#         eco4_of_which_identified_strict = (
#             eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] +
#             ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0]
#         )
#         eco4_of_which_identified_expansive = (
#             eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] +
#             ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0]
#         )
#         # GBIS
#         n_warmfront_identified_gbis = gbis_identified.shape[0]
#         november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0)
#         november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0]
#         gbis_sales_since_november = n_sold_gbis - november_gbis_sold
#         gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0]
#         gbis_of_which_identified_expansive = \
#             gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0]
#
#         to_append = {
#             ("", "HA Name"): ha_name,
#             ("", "# properties in asset list"): n_properties_remaining_in_asset_list,
#             ############
#             # ECO4
#             ############
#             ("ECO4", "# remaining November file"): november_eco4_remaining,
#             ("ECO4", "# sold in November file"): november_eco4_sold,
#             ("ECO4", "# sold (survey list)"): n_sold_eco4,
#             ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga,
#             ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4,
#             ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict,
#             ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive,
#             ("ECO4", "Of which identified by model - total"): (
#                 eco4_of_which_identified_strict + eco4_of_which_identified_expansive
#             ),
#             ("ECO4", "Additional properties"): surplus_eco4.shape[0],
#             ############
#             # GBIS
#             ############
#             ("GBIS", "# remaining November file"): november_gbis_remaining,
#             ("GBIS", "# sold in November file"): november_gbis_sold,
#             ("GBIS", "# sold (survey list)"): n_sold_gbis,
#             ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis,
#             ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict,
#             ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive,
#             ("GBIS", "Of which identified by model - total"): (
#                 gbis_of_which_identified_strict + gbis_of_which_identified_expansive
#             ),
#             ("GBIS", "Additional properties"): surplus_gbis.shape[0]
#         }
#
#         ha_analysis_results.append(to_append)
#
#         # Calculate the revenue results
#         to_append_revenue = {
#             ("", "HA Name"): ha_name,
#             # Eco4 revenue
#             ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate,
#             ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate,
#             ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate,
#             ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate,
#             ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate,
#             ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate,
#             ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate,
#             ("ECO4", "Of which identified by model - total"): eco4_rate * (
#                 eco4_of_which_identified_strict + eco4_of_which_identified_expansive
#             ),
#             ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0],
#         }
#         total_revenue_results.append(to_append_revenue)
#
#     ha_analysis_results = pd.DataFrame(ha_analysis_results)
#     ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns)
#
#     facts_and_figures = loader.facts_and_figures.copy()
#     facts_and_figures["ha_number"] = facts_and_figures["HA Name"].str.extract(r'(\d+)').astype(int)
#     facts_and_figures = facts_and_figures.sort_values("ha_number")
#     facts_and_figures = facts_and_figures.drop(columns=["ha_number"])
#
#     # Rename some of the cols
#     facts_and_figures = facts_and_figures.rename(
#         columns={
#             # ECO4 cols
#             "ECO4": "ECO4 - November",
#             "GBIS": "GBIS - November",
#             "eco4 (subject to ciga)": "ECO4 - subject to ciga",
#             "eco4": "ECO4 - doesn't need CIGA",
#             "eco4 - passed ciga": "ECO4 - passed CIGA",
#             "failed ciga": "ECO4 - failed CIGA",
#             "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS",
#             "ECO4 - in progress": "ECO4 - Install in progress",
#             "ECO4 - cancelled": "ECO4 - Install cancelled",
#             # GBIS cols
#             "gbis": "GBIS total (asset list)"
#         }
#     )
#     # We calculate the eco4 total from the asset list
#     # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is
#     # ECO4 - doesn't need CIGA + ECO4 - passed CIGA
#     # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is
#     # ECO4 - doesn't need CIGA + ECO4 - subject to ciga
#     facts_and_figures["ECO4 total (asset list - pre ciga)"] = (
#         facts_and_figures["ECO4 - doesn't need CIGA"] +
#         facts_and_figures["ECO4 - subject to ciga"] +
#         facts_and_figures["ECO4 - passed CIGA"]
#     )
#
#     facts_and_figures["ECO4 total (asset list - post ciga)"] = None
#     facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where(
#         facts_and_figures["ECO4 - passed CIGA"] > 0,
#         facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"],
#         facts_and_figures["ECO4 total (asset list - post ciga)"]
#     )
#
#     # Re-arrange the columns
#     facts_and_figures = facts_and_figures[
#         [
#             'HA Name',
#             'ECO4 - November',
#             'GBIS - November',
#             'ECO4 total (asset list - pre ciga)',
#             'ECO4 total (asset list - post ciga)',
#             'GBIS total (asset list)',
#             'ECO4 - subject to ciga',
#             "ECO4 - doesn't need CIGA",
#             'ECO4 - passed CIGA',
#             'ECO4 - failed CIGA',
#             'ECO4 - installed',
#             'ECO4 - Install in progress',
#             'ECO4 - Install cancelled',
#             'ECO4 - partially installed',
#             'ECO4 - Install downgrade to GBIS',
#         ]
#     ]
#     # Addd a note to flag any rows where ECO4 (
#     # subject to ciga is greater than 0) and (ECO4 - passed ciga is greater than 0
#     # )
#     facts_and_figures["Missed CIGA checks opportunity"] = None
#     facts_and_figures["Missed CIGA checks opportunity"] = np.where(
#         (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0),
#         "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype(
#             str) + " ECO4 properties needing a CIGA check",
#         facts_and_figures["Missed CIGA checks opportunity"]
#     )
#
#     facts_and_figures.to_csv("Facts and figures sample.csv")
#
#     # Re arrage the columns
#
#     # Also sort ha_analysis_results by ha number
#     ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int)
#     ha_analysis_results = ha_analysis_results.sort_values("ha_number")
#     ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"])
#
#     # We save 2 sheets
#     # Automate creation of the excel
#     # Create a Pandas Excel writer using XlsxWriter as the engine
#     with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer:
#         # Write each dataframe to a different worksheet without the index
#         for df, sheet in [(facts_and_figures, 'HA Facts and Figures'),
#                           (ha_analysis_results, 'Asset Identification')]:
#
#             df.to_excel(writer, sheet_name=sheet)
#
#             # Auto-adjust columns' width
#             for i, width in enumerate(get_col_widths(df)):
#                 writer.sheets[sheet].set_column(i, i, width)
#
#     # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their
#     #               description, and what proportion of time they get identified via non-invasive surveys
#
#     # true_eco4_assets = []
#     # ciga_dependent_assets = []
#     # not_eligible = []
#     # as_built_insulated = []
#     # date_cols = {
#     #     "HA39": "date_built",
#     #     "HA14": "Built In Year",
#     #     "HA6": "Construction Year",
#     #     "HA1": "Build Date",
#     #     "HA107": "YEAR BUILT"
#     # }
#     # for ha_name, data_objects in outputs.items():
#     #     inputs = [x for k, x in loader.data.items() if k == ha_name][0]
#     #
#     #     date_col = date_cols[ha_name]
#     #     results_df = data_objects["results_df"].copy()
#     #     df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename(
#     #         columns={"row_meaning": "asset_identification_status", date_col: "date_built"}
#     #     ).merge(
#     #         results_df,
#     #         how="left",
#     #         right_on="row_id",
#     #         left_on="asset_list_row_id"
#     #     )
#     #
#     #     # take the true ECO4
#     #     true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy()
#     #     ciga_dependent = df[
#     #         df["ECO Eligibility"].isin(
#     #             [
#     #                 "eco4 (subject to ciga)",
#     #                 "failed ciga",
#     #                 "eco4 - passed ciga"
#     #             ]
#     #         )
#     #     ]
#     #     insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy()
#     #     # We convert date built to datetime
#     #     try:
#     #         insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])]
#     #         insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year
#     #         as_built_insulated.append(insulated_assumed)
#     #     except Exception as e:
#     #         print("oh well")
#     #
#     #     true_eco4_assets.append(true_eco4)
#     #     ciga_dependent_assets.append(ciga_dependent)
#     #
#     # true_eco4_assets = pd.concat(true_eco4_assets)
#     # ciga_dependent_assets = pd.concat(ciga_dependent_assets)
#     # as_built_insulated = pd.concat(as_built_insulated)
#     #
#     # true_eco4_assets["walls"].value_counts(normalize=True)
#     # ciga_dependent_assets["walls"].value_counts(normalize=True)
#     #
#     # from recommendations.recommendation_utils import extract_insulation_thickness
#     #
#     # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply(
#     #     lambda x: extract_insulation_thickness(x)
#     # )
#     #
#     # true_eco4_assets["e"] = true_eco4_assets.merge(
#     #     pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]],
#     #     how="left",
#     #     left_on="roof",
#     #     right_on="original_description"
#     # )
#     #
#     # true_eco4_assets["sap"].mean()
#     #
#     # true_eco4_assets["insulation_thickness"].isin(
#     #     ["250", "150", "200", "100", "75", "50"]
#     # ).sum() / true_eco4_assets.shape[0]
#     #
#     # true_eco4_assets["insulation_thickness"].isin(
#     #     ["100"]
#     # ).sum() / true_eco4_assets.shape[0]
#     #
#     # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True)


def get_propensity_model_data(
    loader, cleaned, cleaning_data, created_at, photo_supply_lookup,
    floor_area_decile_thresholds, pull_data=True, random_state=None
):
    """
    Pull EPC data for a sample of each HA's asset list and store it in S3 as propensity model training data.

    For every HA in ``loader.data`` that has a non-empty survey list, the sample consists of the surveyed
    ("sold") asset list rows plus a random draw of asset list rows. The draw is sized so that the share of
    sold rows in the sample matches the share of the asset list that is not marked "not eligible". Note that
    the random draw is taken from the whole asset list, so it can overlap with the sold rows.

    Only properties for which the EPC search returns a single EPC (no older EPCs and no full SAP EPC) are
    kept. The prepared EPC of each kept property is stored, together with its "ECO Eligibility" and
    "asset_list_row_id", at ``propensity_model_data/{ha_name}/train.pickle``; rows that raised while being
    prepared are stored at ``propensity_model_data/{ha_name}/errors.pickle``.

    :param loader: object whose ``data`` attribute maps an HA name to a dict holding the "survey_list" and
        "asset_list" dataframes
    :param cleaned: not used by this function
    :param cleaning_data: passed through to ``EPCRecord``
    :param created_at: not used by this function
    :param photo_supply_lookup: not used by this function
    :param floor_area_decile_thresholds: not used by this function
    :param pull_data: not used by this function
    :param random_state: seed forwarded to ``Series.sample`` for the random draw. The default of None keeps
        the previous, non-reproducible behaviour
    :return: list with one dataframe of prepared EPC rows per processed HA
    """
    model_data = []
    for ha_name, data_assets in loader.data.items():

        logger.info("Processing HA: %s", ha_name)
        survey_list = data_assets["survey_list"]
        if survey_list.empty:
            continue

        number_sold = survey_list.shape[0]

        # For each HA, we pull in the data required, and store it in S3
        asset_list = data_assets["asset_list"].copy()
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]

        # Without any eligible rows the success rate is zero (or undefined for an empty asset list) and the
        # sample size below cannot be derived, so we skip the HA rather than fail on a division by zero
        if asset_list_size == 0 or n_eligible == 0:
            logger.warning("Skipping HA %s: the asset list has no eligible properties", ha_name)
            continue

        # We size the sample so that sold / sample size equals the eligibility rate of the asset list
        success_rate = n_eligible / asset_list_size
        needed_sample_size = np.ceil(number_sold / success_rate)
        number_negative_samples = int(needed_sample_size - number_sold)
        # We cannot draw more rows than the asset list holds (sample() raises when asked to)
        number_negative_samples = min(number_negative_samples, asset_list_size)

        sold_asset_list_ids = survey_list["asset_list_row_id"].tolist()
        negative_sample_asset_list_ids = asset_list["asset_list_row_id"].sample(
            number_negative_samples, random_state=random_state
        ).tolist()
        sample_ids = sold_asset_list_ids + negative_sample_asset_list_ids

        sample_asset_list = asset_list[asset_list["asset_list_row_id"].isin(sample_ids)]

        # In order to have the most confidence, we should take just properties that have 1 EPC. We might need to
        # cut down the number of properties that we include because of this
        # Note: This is an imbalanced problem so we will need to build a model accommodating of that

        data = []
        errors = []
        for _, property_meta in tqdm(sample_asset_list.iterrows(), total=len(sample_asset_list)):

            # A missing postcode can be None or NaN depending on how the asset list was built
            if pd.isnull(property_meta["matching_postcode"]):
                continue

            property_type, built_form = get_property_type_and_built_form(
                property_meta=property_meta, ha_name=ha_name
            )

            searcher = SearchEpc(
                address1=str(property_meta["HouseNo"]),
                postcode=property_meta["matching_postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                full_address=property_meta["matching_address"]
            )
            searcher.ordnance_survey_client.property_type = property_type
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                continue

            if searcher.newest_epc.get("estimated"):
                # We insert the row ID (the part of the asset list row id after the HA name) as our
                # proxy for UPRN
                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])

            newest_epc = searcher.newest_epc
            older_epcs = searcher.older_epcs
            full_sap_epc = searcher.full_sap_epc

            # If we have more than 1 EPC for the moment we just continue
            if older_epcs or full_sap_epc:
                continue

            try:
                # We clean up the data
                epc_records = {
                    'original_epc': newest_epc.copy(),
                    'full_sap_epc': full_sap_epc.copy(),
                    'old_data': older_epcs.copy(),
                }

                epc_record = EPCRecord(
                    epc_records=epc_records,
                    run_mode="newdata",
                    cleaning_data=cleaning_data
                )

                data.append(
                    {
                        "ECO Eligibility": property_meta["ECO Eligibility"],
                        "asset_list_row_id": property_meta["asset_list_row_id"],
                        **epc_record.get("prepared_epc")
                    }
                )
            except Exception as e:
                # Best effort: a property that fails to prepare is recorded and does not stop the run
                errors.append(
                    {
                        "error": str(e),
                        "asset_list_row_id": property_meta["asset_list_row_id"],
                        "matching_postcode": property_meta["matching_postcode"],
                        "matching_address": property_meta["matching_address"]
                    }
                )

        data = pd.DataFrame(data)
        # We store the results in S3 as a pickle
        save_pickle_to_s3(
            data=data,
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
        )

        # Store the errors
        if errors:
            save_pickle_to_s3(
                data=errors,
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/errors.pickle"
            )

        model_data.append(data)

    return model_data


def conversion_model(loader):
    """
    Fit a proof-of-concept XGBoost classifier on the stored propensity data and print its feature importances.

    For every HA in ``loader.data`` the training pickle written to
    ``propensity_model_data/{ha_name}/train.pickle`` is read from S3 and left-joined to the HA's survey list
    on "asset_list_row_id" to pick up "installation_status". HAs whose data cannot be read or merged are
    logged and skipped. The response is 1 when the installation status is "ECO4 - in progress" or
    "ECO4 - installed", and 0 otherwise.

    Per HA, the negative rows are then down-sampled so that the share of positive rows matches the share of
    the HA's asset list that is not marked "not eligible". If fewer negative rows are available than that
    requires, all of them are kept.

    :param loader: object whose ``data`` attribute maps an HA name to a dict holding the "survey_list" and
        "asset_list" dataframes
    :return: None, the sorted feature importances are printed
    :raises ValueError: if no HA's data could be read
    """
    # Read in the model data
    model_data = []
    for ha_name in loader.data.keys():
        try:
            picked = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
            )
            data = pd.DataFrame(picked)

            # We merge on the sales data
            sales_data = loader.data[ha_name]["survey_list"].copy()
            data = data.merge(
                sales_data[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            data["ha_name"] = ha_name

        except Exception as e:
            logger.error("Error reading in the data for %s: %s", ha_name, e)
            continue

        model_data.append(data)

    if not model_data:
        raise ValueError("No propensity model data could be read for any HA")

    model_data = pd.concat(model_data)

    model_data["response"] = model_data["installation_status"].isin(
        [
            "ECO4 - in progress",
            "ECO4 - installed"
        ]
    ).astype(int)

    # Because of how we pulled the data, we need to re-balance the sample
    balanced_sample = []
    for ha_name in model_data["ha_name"].unique():
        df = model_data[model_data["ha_name"] == ha_name]
        positive_samples = df[df["response"] == 1]
        negative_samples = df[df["response"] != 1]

        asset_list = loader.data[ha_name]["asset_list"]
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]

        # With no eligible rows the success rate is zero (or undefined) and the target sample size cannot be
        # derived, so we leave the HA out rather than fail on a division by zero
        if asset_list_size == 0 or n_eligible == 0:
            logger.warning("Skipping HA %s: the asset list has no eligible properties", ha_name)
            continue

        success_rate = n_eligible / asset_list_size
        needed_sample_size = np.ceil(positive_samples.shape[0] / success_rate)
        number_negative_samples = int(needed_sample_size - positive_samples.shape[0])

        # The stored data only holds properties with a single EPC, so there can be fewer negatives available
        # than requested. sample() raises when asked for more rows than exist, so we cap the request
        if number_negative_samples > negative_samples.shape[0]:
            logger.warning(
                "HA %s: wanted %s negative samples but only %s are available",
                ha_name, number_negative_samples, negative_samples.shape[0]
            )
            number_negative_samples = negative_samples.shape[0]

        negative_samples_subset = negative_samples.sample(number_negative_samples)

        balanced_sample.append(pd.concat([positive_samples, negative_samples_subset]))

    balanced_sample = pd.concat(balanced_sample)

    # We work with a small sample
    # Drop the identifier, address, date and bookkeeping columns so that only the candidate features and the
    # response are left
    balanced_sample = balanced_sample.drop(
        columns=['ECO Eligibility', 'asset_list_row_id', 'address', 'uprn_source', 'address3', 'local_authority_label',
                 'county', 'postcode', 'constituency', 'local_authority', 'inspection_date', 'address1',
                 'constituency_label', 'building_reference_number', 'address2', 'posttown', 'lodgement_datetime',
                 'uprn', 'lodgement_date', 'lmk_key', 'installation_status', 'ha_name']
    )

    # POC model
    df = balanced_sample.copy()
    # Fill missings with means, if they exist
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    df[categorical_cols] = df[categorical_cols].fillna("other")

    # Reduce the number of categories to a specific number and the rest to other
    max_n_categories = 10
    for col in categorical_cols:
        top_categories = df[col].value_counts().nlargest(max_n_categories).index
        df[col] = df[col].where(df[col].isin(top_categories), other="other")

    # Use a model based approach to feature selection
    import xgboost as xgb
    from sklearn.model_selection import train_test_split

    # The outcome column is 'response'
    X = df.drop(columns=['response'])
    y = df['response']

    # Encoding categorical variables if not already done
    X = pd.get_dummies(X, drop_first=True)

    # Splitting the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize an XGBoost classifier
    model = xgb.XGBClassifier()

    # Fit the model
    model.fit(X_train, y_train)

    # Map feature importances to their corresponding column names
    feature_importance_dict = dict(zip(X.columns, model.feature_importances_))

    # Sort features by importance, most important first
    sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

    # Display sorted features
    for feature, importance in sorted_features:
        print(f"{feature}: {importance}")


def patch_cleaned(cleaned):
    """
    Add description records that are missing from the cleaned description lookups.

    The lists under "floor-description", "roof-description" and "mainheatcont-description" are extended in
    place with:
    - two uninsulated floor descriptions,
    - two pitched roof descriptions ("Unknown loft insulation" and "300+mm loft insulation"),
    - an "Average thermal transmittance" roof record for every value from 0.00 to 1.99 (step 0.01) that does
      not already have one, plus a variant of each whose original description uses the unit "W/m?K",
    - a 'None' main heating control description.
    The existing '(Same dwelling below) insulated (assumed)' floor records are flagged as having another
    property below, with a thermal transmittance of 0.

    The thermal transmittance records are only added when absent, so calling this function again does not
    add them a second time. The other records are appended on every call.

    :param cleaned: dict of description lookups, each a list of dict records
    :return: the same ``cleaned`` object, after patching
    """
    floor_records = cleaned["floor-description"]
    roof_records = cleaned["roof-description"]

    # Patch to handle missing floor descriptions
    floor_records.extend(
        [
            {'original_description': 'To external air, uninsulated (assumed)',
             'clean_description': 'To external air, no insulation', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': False,
             'is_to_external_air': True, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'none'},
            {'original_description': 'To unheated space, uninsulated (assumed)',
             'clean_description': 'To unheated space, uninsulated', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': True,
             'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'average'}
        ]
    )

    # Patch the missing pitched roof descriptions (each one is added once)
    roof_records.extend(
        [
            {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_pitched': True,
             'is_roof_room': False,
             'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'},
            {'original_description': 'Pitched, 300+mm loft insulation',
             'clean_description': 'Pitched, 300+ mm loft insulation', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_pitched': True, 'is_roof_room': False, 'is_loft': True,
             'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '300+'
             }
        ]
    )

    def average_transmittance_record(original_description, clean_description, value):
        # Roof record for an "Average thermal transmittance" description.
        # NOTE(review): the unit string is kept exactly as written in this file (presumably a mis-encoded
        # W/m²K that matches the data) - confirm before changing it
        return {
            'original_description': original_description,
            'clean_description': clean_description,
            'thermal_transmittance': value,
            'thermal_transmittance_unit': 'w/m-¦k', 'is_pitched': False, 'is_roof_room': False, 'is_loft': False,
            'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
            'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None
        }

    thermal_transmittance_values = [round(ttv, 2) for ttv in np.arange(0, 2, 0.01)]

    for ttv_rounded in thermal_transmittance_values:
        # We look for an instance of that thermal transmittance value
        already_present = any(
            (x["thermal_transmittance"] == ttv_rounded) and "Average thermal transmittance" in x["clean_description"]
            for x in roof_records
        )
        if already_present:
            continue

        # We patch the record
        roof_records.append(
            average_transmittance_record(
                original_description=f'Average thermal transmittance {ttv_rounded} W/m-¦K',
                clean_description=f'Average thermal transmittance {ttv_rounded} w/m-¦k',
                value=ttv_rounded
            )
        )

    # We also patch a funny unit value we found
    for ttv_rounded in thermal_transmittance_values:
        # The funny-unit descriptions always carry two decimals, e.g. 0.10 rather than 0.1
        ttv_string = f"{ttv_rounded:.2f}"
        original_description = f'Average thermal transmittance {ttv_string} W/m?K'

        # The patched record stores the normalised unit, so a check on the unit alone never finds it again.
        # We therefore also look for the original description we are about to add
        already_present = any(
            x.get("original_description") == original_description or (
                (x["thermal_transmittance"] == ttv_rounded)
                and "Average thermal transmittance" in x["clean_description"]
                and x["thermal_transmittance_unit"] == "w/m?K"
            )
            for x in roof_records
        )
        if already_present:
            continue

        # We patch the record
        roof_records.append(
            average_transmittance_record(
                original_description=original_description,
                clean_description=f'Average thermal transmittance {ttv_string} w/m-¦k',
                value=ttv_rounded
            )
        )

    # Patch mainheatcont-description
    cleaned["mainheatcont-description"].extend(
        [
            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
             'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
             'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
             'rate_control': None}
        ]
    )

    # We patch this record because there is another property below
    for x in floor_records:
        if x["original_description"] == '(Same dwelling below) insulated (assumed)':
            x["another_property_below"] = True
            x["thermal_transmittance"] = 0

    return cleaned


def calculate_eco4_post_ciga(
    eligiblity_counts, input_data, ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate,
    eco4_rate, archetype_conversion_rate
):
    """
    Break a set of ECO4 eligibility counts down into expected sales, archetype failures, CIGA failures and
    cancellations.

    :param eligiblity_counts: DataFrame with an "ECO Eligibility" column (string labels such as "eco4",
        "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga", "eco4 (subject to archetype)") and a
        "count" column holding the number of assets carrying each label.
    :param input_data: Mapping with a "ciga_list" entry exposing ``.empty``. A non-empty list means a CIGA check
        has been run, so the "eco4 - passed ciga" / "failed ciga" counts are treated as confirmed outcomes.
    :param ha_ciga_conversion_rate: Fraction of jobs still awaiting a CIGA check that are expected to pass it.
    :param ha_ciga_pass_to_sale_rate: Fraction of CIGA-passed jobs expected to convert to a sale.
    :param ha_eco4_to_sale_rate: Fraction of jobs that need no CIGA check expected to convert to a sale.
    :param eco4_rate: Value applied per job to turn counts into the "£" figures.
    :param archetype_conversion_rate: Fraction of "subject to archetype" jobs expected to pass that check.
    :return: Dictionary of counts and "£" values keyed by report label. Every count ("#") entry is a plain
        Python ``int`` regardless of which branch produced it.
    """
    labels = eligiblity_counts["ECO Eligibility"]

    def count_where(mask):
        # Total number of assets whose eligibility label satisfies the mask
        return eligiblity_counts.loc[mask, "count"].sum()

    def forecast_ciga_dependent(pending):
        # Returns (expected sales, expected cancellations, expected CIGA failures) for jobs awaiting a CIGA check
        expected_to_pass = np.round(pending * ha_ciga_conversion_rate)
        expected_sales = np.round(expected_to_pass * ha_ciga_pass_to_sale_rate)
        return expected_sales, expected_to_pass - expected_sales, pending - expected_to_pass

    needs_ciga = labels.str.contains("subject to ciga")
    needs_archetype = labels.str.contains("subject to archetype")

    remaining_needing_ciga_check = count_where(needs_ciga & ~needs_archetype)
    remaining_needing_ciga_and_archetype_check = count_where(needs_ciga & needs_archetype)

    # We scale this down by the archetype_conversion_rate, and add this on to the remaining_needing_ciga_check
    remaining_needing_ciga_and_archetype_check_passed = np.round(
        remaining_needing_ciga_and_archetype_check * archetype_conversion_rate
    )
    remaining_needing_ciga_check += remaining_needing_ciga_and_archetype_check_passed

    # Jobs that need no CIGA check: plain eco4, plus the share of archetype-dependent jobs expected to pass
    eco4_no_ciga_archetype_needed = count_where(labels == "eco4 (subject to archetype)")
    eco4_no_ciga_archetype_needed_passed = np.round(
        eco4_no_ciga_archetype_needed * archetype_conversion_rate
    )
    eco4_no_ciga_needed = count_where(labels == "eco4") + eco4_no_ciga_archetype_needed_passed

    failed_archetype_check = int(
        remaining_needing_ciga_and_archetype_check +
        eco4_no_ciga_archetype_needed -
        remaining_needing_ciga_and_archetype_check_passed -
        eco4_no_ciga_archetype_needed_passed
    )

    has_ciga_check = not input_data["ciga_list"].empty
    if has_ciga_check:
        eco4_ciga_passed = count_where(labels == "eco4 - passed ciga")
        eco4_confirmed_ciga_failures = count_where(labels == "failed ciga")

        eco4_confirmed = np.round(
            (eco4_no_ciga_needed * ha_eco4_to_sale_rate) +
            (eco4_ciga_passed * ha_ciga_pass_to_sale_rate)
        )
        confirmed_cancellations = int(eco4_no_ciga_needed + eco4_ciga_passed - eco4_confirmed)

        if remaining_needing_ciga_check > 0:
            # We update the eco4 post ciga with the converted remaining
            eco4_remaining_forecast, forecast_cancellations, eco4_estimated_ciga_failures = (
                forecast_ciga_dependent(remaining_needing_ciga_check)
            )
        else:
            # Nothing is waiting on CIGA, so the conversion rates are deliberately left untouched
            eco4_remaining_forecast = 0
            forecast_cancellations = 0
            eco4_estimated_ciga_failures = 0
    else:
        # No CIGA list: there are no confirmed outcomes, everything CIGA dependent is forecast
        eco4_confirmed_ciga_failures = 0
        # Multiply by sale conversion
        eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate)
        confirmed_cancellations = int(eco4_no_ciga_needed - eco4_confirmed)
        eco4_remaining_forecast, forecast_cancellations, eco4_estimated_ciga_failures = (
            forecast_ciga_dependent(remaining_needing_ciga_check)
        )

    # All of these are integer valued, so casting is lossless. It guarantees every branch reports the same
    # types rather than a mix of int, numpy int and numpy float
    eco4_post_ciga = int(eco4_confirmed + eco4_remaining_forecast)
    eco4_remaining_forecast = int(eco4_remaining_forecast)
    eco4_confirmed = int(eco4_confirmed)
    eco4_confirmed_ciga_failures = int(eco4_confirmed_ciga_failures)
    eco4_estimated_ciga_failures = int(eco4_estimated_ciga_failures)
    eco4_total_ciga_failures = eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures
    eco4_expected_cancellations = confirmed_cancellations + int(forecast_cancellations)

    results = {
        # Counts
        "ECO4 - post CIGA - #": eco4_post_ciga,
        "Of which confirmed - #": eco4_confirmed,
        "Of which forecast - #": eco4_remaining_forecast,
        # Revenue
        "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate,
        "Of which confirmed - £": eco4_confirmed * eco4_rate,
        "Of which forecast - £": eco4_remaining_forecast * eco4_rate,
        # Archetype check failures
        "Estimated total - failed archetype check - #": failed_archetype_check,
        "Estimated total - failed archetype check - £": failed_archetype_check * eco4_rate,
        # Ciga failures
        "Estimated total - failed CIGA": eco4_total_ciga_failures,
        "Confirmed CIGA failures": eco4_confirmed_ciga_failures,
        "Estimated CIGA failures": eco4_estimated_ciga_failures,
        # Ciga failures cost
        "Estimated total - failed CIGA - £": int(eco4_total_ciga_failures * eco4_rate),
        "Confirmed CIGA failures - £": int(eco4_confirmed_ciga_failures * eco4_rate),
        "Estimated CIGA failures - £": int(eco4_estimated_ciga_failures * eco4_rate),
        # Expected cancellations
        "Expected cancellations - #": eco4_expected_cancellations,
        "Expected cancellations - £": eco4_expected_cancellations * eco4_rate
    }

    return results


def forecast_remaining_sales(loader):
    # Assumptions:
    # We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate
    # and I don't want the numbers to change too much, depenent on the CIGA conversation rate
    maximum_ciga_conversion = 0.75

    # This is a hard limit to the allowed conversion rates to final sale. These are typically very
    # high but there are some anomalies, amongst surveys that are early on
    sales_conversion_lower_bound = 0.8

    gbis_rate = 600
    eco4_rate = 1710

    # Based on ONS https://www.ons.gov.uk/peoplepopulationandcommunity/housing/bulletins/housingenglandandwales
    # /census2021
    # there are 5.7 million terraced properties in the UK, of the 19.3 million houses or bungalows. We therefore apply
    # a 30% discount to homes that are dependent on an archetype check, since around 30% of them will be mid terraced
    # This 30% is slightly harsh but we be conservative
    # Therefore, the archetype check conversion rate is 70%
    archetype_conversion_rate = 0.7

    # 1) Calculate the conversion rate from passed CIGA to actual sale
    converted_ciga_jobs = []
    for ha_name, input_data in loader.data.items():
        asset_list = input_data["asset_list"].copy()
        survey_list = input_data["survey_list"].copy()

        if survey_list.empty:
            continue

        ciga_dependent_assets = asset_list[
            asset_list["ECO Eligibility"] == "eco4 - passed ciga"
            ]

        # These are now the ciga dependent assets at installation
        ciga_dependent_assets_at_installation = ciga_dependent_assets.merge(
            survey_list[["asset_list_row_id", "installation_status"]],
            how="inner",
            on="asset_list_row_id"
        )

        # We then calculate how many get cancelled
        ciga_dependent_assets_sold = ciga_dependent_assets_at_installation[
            ciga_dependent_assets_at_installation["installation_status"].isin(
                [
                    "ECO4 - installed", "ECO4 - in progress"
                ]
            )
        ]

        ciga_dependent_assets_failed = ciga_dependent_assets_at_installation[
            ~ciga_dependent_assets_at_installation["installation_status"].isin(
                [
                    "ECO4 - installed", "ECO4 - in progress"
                ]
            )
        ]

        converted_ciga_jobs.append(
            {
                "HA Name": ha_name,
                "# Ciga dependent at installation": ciga_dependent_assets_at_installation.shape[0],
                "# Ciga dependent successfully installed": ciga_dependent_assets_sold.shape[0],
                "# Ciga dependent failed install": ciga_dependent_assets_failed.shape[0]
            }
        )

    converted_ciga_jobs = pd.DataFrame(converted_ciga_jobs)

    # We calculate a ciga pass to install conversaion rate
    median_ciga_pass_to_install = (
        converted_ciga_jobs["# Ciga dependent successfully installed"].sum() /
        converted_ciga_jobs["# Ciga dependent at installation"].sum()
    )

    # 2) Calculate the conversion rate from CIGA dependent to ciga passed
    ciga_passrates = []
    for ha_name, input_data in loader.data.items():

        # If we don't have a ciga list, we can't do anything
        if input_data["ciga_list"].empty:
            continue

        # 1) Calculate the conversion rate for CIGA to actual sale
        asset_list = input_data["asset_list"].copy()

        ciga_completed_assets = asset_list[
            asset_list["ECO Eligibility"].isin(
                [
                    "eco4 - passed ciga",
                    "failed ciga"
                ]
            )
        ]

        ciga_passed = ciga_completed_assets[
            ciga_completed_assets["ECO Eligibility"].isin(
                [
                    "eco4 - passed ciga"
                ]
            )
        ]

        ciga_passrates.append(
            {
                "Ha Name": ha_name,
                "# CIGA dependent": ciga_completed_assets.shape[0],
                "# CIGA passed": ciga_passed.shape[0],
            }
        )

    ciga_passrates = pd.DataFrame(ciga_passrates)

    median_ciga_success_rate = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum()

    # 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install
    eco4_ciga_independent_to_install = []
    gbis_to_install = []
    for ha_name, input_data in loader.data.items():
        asset_list = input_data["asset_list"].copy()
        survey_list = input_data["survey_list"].copy()

        if survey_list.empty:
            continue

        # For properties that were identified as a typical ECO4 job, we calculate the number of properties that
        # installed
        # vs cancelled

        typical_eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"]
        typical_gbis = asset_list[asset_list["ECO Eligibility"] == "gbis"]

        # Merge on the surveys
        typical_eco4_installed = typical_eco4.merge(
            survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
        )

        if not typical_eco4_installed.empty:
            typical_eco4_sold = typical_eco4_installed[
                typical_eco4_installed["installation_status"].isin(
                    [
                        "ECO4 - installed", "ECO4 - in progress"
                    ]
                )
            ]

            eco4_ciga_independent_to_install.append(
                {
                    "Ha Name": ha_name,
                    "# ECO4 at install stage": typical_eco4_installed.shape[0],
                    "# ECO4 successfully installed": typical_eco4_sold.shape[0]
                }
            )

        typical_gbis_installed = typical_gbis.merge(
            survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
        )
        if not typical_gbis_installed.empty:
            typical_gbis_sold = typical_gbis_installed[
                typical_gbis_installed["installation_status"].isin(
                    [
                        "GBIS - in progress", "GBIS - installed"
                    ]
                )
            ]

            gbis_to_install.append(
                {
                    "Ha Name": ha_name,
                    "# GBIS at install stage": typical_gbis_installed.shape[0],
                    "# GBIS successfully installed": typical_gbis_sold.shape[0]
                }
            )

    eco4_ciga_independent_to_install = pd.DataFrame(eco4_ciga_independent_to_install)
    gbis_to_install = pd.DataFrame(gbis_to_install)

    eco4_ciga_independent_to_install["conversion"] = (
        eco4_ciga_independent_to_install["# ECO4 successfully installed"] /
        eco4_ciga_independent_to_install["# ECO4 at install stage"]
    )
    eco4_ciga_independent_to_install_clipped = eco4_ciga_independent_to_install[
        eco4_ciga_independent_to_install["conversion"] >= sales_conversion_lower_bound
        ]

    gbis_to_install["conversion"] = (
        gbis_to_install["# GBIS successfully installed"] /
        gbis_to_install["# GBIS at install stage"]
    )
    gbis_to_install_clipped = gbis_to_install[
        gbis_to_install["conversion"] >= sales_conversion_lower_bound
        ]

    median_eco4_to_install = (
        eco4_ciga_independent_to_install_clipped["# ECO4 successfully installed"].sum() /
        eco4_ciga_independent_to_install_clipped["# ECO4 at install stage"].sum()
    )

    median_gbis_to_install = (
        gbis_to_install_clipped["# GBIS successfully installed"].sum() /
        gbis_to_install_clipped["# GBIS at install stage"].sum()
    )

    # Produce the final output
    december_figures = loader.december_figures.copy()
    december_figures = december_figures.fillna(0)
    # If we have negative remaining, it means that actually sold more gbis than they initially thought so we set
    # remaining to 0
    december_figures["ECO4 remaining"] = np.where(
        december_figures["ECO4 remaining"] < 0, 0, december_figures["ECO4 remaining"]
    )
    december_figures["GBIS remaining"] = np.where(
        december_figures["GBIS remaining"] < 0, 0, december_figures["GBIS remaining"]
    )

    results = []
    for ha_name, input_data in loader.data.items():

        # Original warmfront figures - ECO4
        original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name]
        if original_warmfront_estimates.empty:
            # Append an empty row
            original_warmfront_estimates = december_figures.head(1).copy()
            for k in original_warmfront_estimates.columns:
                original_warmfront_estimates[k] = 0
            original_warmfront_estimates["HA Name"] = ha_name

        original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0]
        original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0]
        original_warmfront_sold_eco4 = (
            original_warmfront_estimates["No. of Tech surveys complete - Eco 4"].values[0] * eco4_rate
        )

        original_warmfront_eco4_revenue = original_warmfront_eco4 * eco4_rate
        original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate
        original_warmfront_sold_gbis = (
            original_warmfront_estimates["No. of Tech surveys complete - GBIS"].values[0] * gbis_rate
        )

        # Original warmfront figures - GBIS

        original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0]
        original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0]

        original_warmfront_gbis_revenue = (
            original_warmfront_gbis * gbis_rate
        )
        original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate

        # Asset list - ECO4
        asset_list = input_data["asset_list"].copy()
        survey_list = input_data["survey_list"].copy()

        if survey_list.empty:
            asset_list_remaining = asset_list.copy()
        else:
            # For HA6, there are a small number of postcodes that do not match to any item in the asset list
            survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
            asset_list_remaining = asset_list.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])]
            asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"])

        eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index()
        eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index()

        eco4_pre_ciga = eligiblity_counts[
            eligiblity_counts["ECO Eligibility"].isin(
                [
                    "eco4",
                    "eco4 (subject to ciga)",
                    "eco4 - passed ciga",
                    "failed ciga",
                    "eco4 (subject to ciga) (subject to archetype)",
                    "eco4 (subject to archetype)"
                ]
            )
        ]["count"].sum()

        eco4_pre_ciga_remaining = eligiblity_counts_remaining[
            eligiblity_counts_remaining["ECO Eligibility"].isin(
                [
                    "eco4",
                    "eco4 (subject to ciga)",
                    "eco4 - passed ciga",
                    "failed ciga",
                    "eco4 (subject to ciga) (subject to archetype)",
                    "eco4 (subject to archetype)"
                ]
            )
        ]["count"].sum()

        eco4_pre_ciga_revenue = eco4_pre_ciga * eco4_rate
        eco4_pre_ciga_remaining_revenue = eco4_pre_ciga_remaining * eco4_rate

        # Total Eligible - this is what passed ciga checks + strict. If we don't have what passed CIGA, we estimate
        # We check if the HA has done a CIGA check. Also, if we have assets dormant at CIGA, we estimate what will
        # convert
        # We estimate a conversion for anything left post CIGA
        ha_ciga_conversion = ciga_passrates[ciga_passrates["Ha Name"] == ha_name]
        if not ha_ciga_conversion.empty:
            ha_ciga_conversion_rate = (
                ha_ciga_conversion["# CIGA passed"].values[0] / ha_ciga_conversion["# CIGA dependent"].values[0]
            )
        else:
            ha_ciga_conversion_rate = (
                median_ciga_success_rate if median_ciga_success_rate <= maximum_ciga_conversion else
                maximum_ciga_conversion
            )

        # We also need the ha ciga passed to install success rate
        ha_ciga_pass_to_sale = converted_ciga_jobs[converted_ciga_jobs["HA Name"] == ha_name]
        if not ha_ciga_pass_to_sale.empty and ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] != 0:
            ha_ciga_pass_to_sale_rate = (
                ha_ciga_pass_to_sale["# Ciga dependent successfully installed"].values[0] /
                ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0]
            )
        else:
            ha_ciga_pass_to_sale_rate = median_ciga_pass_to_install

        ha_eco4_to_sale = eco4_ciga_independent_to_install_clipped[
            eco4_ciga_independent_to_install_clipped["Ha Name"] == ha_name
            ]
        if not ha_eco4_to_sale.empty:
            ha_eco4_to_sale_rate = (
                ha_eco4_to_sale['# ECO4 successfully installed'].values[0] /
                ha_eco4_to_sale['# ECO4 at install stage'].values[0]
            )
        else:
            ha_eco4_to_sale_rate = median_eco4_to_install

        eco4_post_ciga_total_results = calculate_eco4_post_ciga(
            eligiblity_counts=eligiblity_counts,
            input_data=input_data,
            ha_ciga_conversion_rate=ha_ciga_conversion_rate,
            ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate,
            ha_eco4_to_sale_rate=ha_eco4_to_sale_rate,
            eco4_rate=eco4_rate,
            archetype_conversion_rate=archetype_conversion_rate
        )

        eco4_post_ciga_remaining_results = calculate_eco4_post_ciga(
            eligiblity_counts=eligiblity_counts_remaining,
            input_data=input_data,
            ha_ciga_conversion_rate=ha_ciga_conversion_rate,
            ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate,
            ha_eco4_to_sale_rate=ha_eco4_to_sale_rate,
            eco4_rate=eco4_rate,
            archetype_conversion_rate=archetype_conversion_rate
        )

        # Calculate the delta compared to Warmfront's original remaining
        if original_warmfront_remaining_eco4 == 0:
            eco4_delta_vs_original_estimate_remaining = "N/A"
        else:
            eco4_delta_vs_original_estimate_remaining = ((eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] -
                                                          original_warmfront_remaining_eco4) /
                                                         original_warmfront_remaining_eco4)

        # GBIS Figures
        # Estimate the GBIS conversion rate
        ha_gbis_sale_conversion = gbis_to_install_clipped[
            gbis_to_install_clipped["Ha Name"] == ha_name
            ]

        if not ha_gbis_sale_conversion.empty:
            ha_gbis_sale_conversion = (
                ha_gbis_sale_conversion["# GBIS successfully installed"].values[0] /
                ha_gbis_sale_conversion["# GBIS at install stage"].values[0]
            )
        else:
            ha_gbis_sale_conversion = median_gbis_to_install

        gbis_total_pre_cancellations = eligiblity_counts[
            eligiblity_counts["ECO Eligibility"] == "gbis"
            ]["count"].sum()

        gbis_total_pre_cancellations_revenue = gbis_total_pre_cancellations * gbis_rate
        # gbis_total = int(np.round(gbis_total_pre_cancellations * ha_gbis_sale_conversion))
        # gbis_total_revenue = int(gbis_total * gbis_rate)

        gbis_remaining_pre_cancellations = eligiblity_counts_remaining[
            eligiblity_counts_remaining["ECO Eligibility"] == "gbis"
            ]["count"].sum()
        gbis_remaining_pre_cancellations_revenue = (
            gbis_remaining_pre_cancellations * gbis_rate
        )
        # This is the gbis jobs we expect to sell
        gbis_remaining = int(np.round(gbis_remaining_pre_cancellations * ha_gbis_sale_conversion))
        gbis_remaining_revenue = int(gbis_remaining * gbis_rate)
        # This is the number we expect to cancel
        gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining)
        gbis_remaining_expected_cancellations_revenue = gbis_remaining_expected_cancellations * gbis_rate

        # GBIS delta
        if original_warmfront_remaining_gbis == 0:
            gbis_delta_vs_original_estimate_remaining = "N/A"
        else:
            gbis_delta_vs_original_estimate_remaining = (
                (gbis_remaining - original_warmfront_remaining_gbis) / original_warmfront_remaining_gbis
            )

        # Current sales figures
        # For any sales surveys that are complete, that could still cancel, we apply a conversion rate
        eco4_actually_sold = 0
        eco4_confirmed_cancellations = 0
        eco4_expected_cancellations = 0

        gbis_actually_sold = 0
        gbis_confirmed_cancellations = 0
        gbis_expected_cancellations = 0
        if not survey_list.empty:
            surveys_with_eligibility = survey_list.merge(
                asset_list[["asset_list_row_id", "ECO Eligibility"]],
                how="left", on="asset_list_row_id"
            )
            completed_eco4_sales = surveys_with_eligibility[
                surveys_with_eligibility["installation_status"] == "ECO4 - installed"
                ].shape[0]
            incomplete_eco4_sales = surveys_with_eligibility[
                (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") &
                (~surveys_with_eligibility["ECO Eligibility"].isin(
                    ["eco4 - passed ciga"])
                 )
                ].shape[0]
            incomplete_eco4_sales_ciga = surveys_with_eligibility[
                (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") &
                (surveys_with_eligibility["ECO Eligibility"].isin(
                    ["eco4 - passed ciga"])
                )
                ].shape[0]

            eco4_confirmed_cancellations = surveys_with_eligibility[
                surveys_with_eligibility["installation_status"] == "ECO4 - cancelled"
                ].shape[0]

            expected_eco4_sales_no_ciga = np.round(incomplete_eco4_sales * ha_eco4_to_sale_rate)
            expected_eco4_sales_ciga = np.round(incomplete_eco4_sales_ciga * ha_ciga_pass_to_sale_rate)

            eco4_expected_cancellations = (incomplete_eco4_sales + incomplete_eco4_sales_ciga) - (
                expected_eco4_sales_no_ciga + expected_eco4_sales_ciga
            )
            eco4_expected_cancellations = int(np.round(eco4_expected_cancellations))

            eco4_actually_sold = eco4_rate * (
                completed_eco4_sales + expected_eco4_sales_no_ciga + expected_eco4_sales_ciga
            )

            completed_gbis_sales = surveys_with_eligibility[
                surveys_with_eligibility["installation_status"] == "GBIS - installed"
                ].shape[0]
            incomplete_gbis_sales = surveys_with_eligibility[
                (surveys_with_eligibility["installation_status"] == "GBIS - in progress")
            ].shape[0]

            # Get confirmed cancellations
            gbis_confirmed_cancellations = surveys_with_eligibility[
                surveys_with_eligibility["installation_status"] == "GBIS - cancelled"
                ].shape[0]

            expected_gbis_unconfirmed_sales = np.round(incomplete_gbis_sales * ha_gbis_sale_conversion)

            gbis_expected_cancellations = int(incomplete_gbis_sales - expected_gbis_unconfirmed_sales)

            gbis_actually_sold = completed_gbis_sales * gbis_rate + (
                expected_gbis_unconfirmed_sales * gbis_rate
            )

        # Add in the variance:
        # We should expect that the pre-ciga total is:
        # 1) The number of post CIGA successes +
        # 2) The number of archetype failures +
        # 2) the number of CIGA failures +
        # 3) The number of cancellations
        variance_total = eco4_pre_ciga - (
            eco4_post_ciga_total_results["ECO4 - post CIGA - #"] +
            eco4_post_ciga_total_results["Estimated total - failed archetype check - #"] +
            eco4_post_ciga_total_results['Estimated total - failed CIGA'] +
            eco4_post_ciga_total_results["Expected cancellations - #"]
        )
        if variance_total != 0:
            raise ValueError("Something went wrong in variance total")

        variance_remaining = eco4_pre_ciga_remaining - (
            eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] +
            eco4_post_ciga_remaining_results["Estimated total - failed archetype check - #"] +
            eco4_post_ciga_remaining_results['Estimated total - failed CIGA'] +
            eco4_post_ciga_remaining_results["Expected cancellations - #"]
        )

        if variance_remaining != 0:
            raise ValueError("Something went wrong in variance remaining")

        # We also check variances to make sure that the pre-CIGA ECO4 total equals
        # 1) Pre CIGA remaining +
        # 2) ECO4 sold +
        # 3) ECO4 confirmed cancellations +
        # 4) ECO4 unconfirmed cancellations

        pre_ciga_eco4_variance = (
            eco4_pre_ciga_revenue -
            eco4_pre_ciga_remaining_revenue -
            eco4_actually_sold -
            eco4_confirmed_cancellations * eco4_rate -
            eco4_expected_cancellations * eco4_rate
        )

        if pre_ciga_eco4_variance != 0:
            raise ValueError("Something went wrong in pre_ciga_eco4_variance")

        # Check GBIS total variance
        # The total before cancellations should equal:
        # The number of sold +
        # The number of confirmed cancelled +
        # The number of expected cancelled +
        # The number of remaining
        gbis_variance = gbis_total_pre_cancellations - (
            gbis_actually_sold / gbis_rate +
            gbis_confirmed_cancellations +
            gbis_expected_cancellations +
            gbis_remaining_pre_cancellations
        )

        if gbis_variance != 0:
            raise ValueError("Something went wrong in gbis_variance")

        # We expect the remaining to equal expected sales + expected cancellations
        gbis_variance_2 = gbis_remaining_pre_cancellations - (
            gbis_remaining +
            gbis_remaining_expected_cancellations
        )

        if gbis_variance_2 != 0:
            raise ValueError("Something went wrong in gbis_variance2")

        # Update the GBIS sold, since Warmfront often sold more GBIS that expected
        original_warmfront_gbis_revenue = original_warmfront_sold_gbis + original_warmfront_remaining_gbis_revenue
        original_warmfront_gbis = (
            original_warmfront_sold_gbis / gbis_rate + original_warmfront_remaining_gbis_revenue / gbis_rate
        )

        to_append = {
            ("", "", "", "HA Name"): ha_name,
            # ECO4 - original warmfront figures
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4,
            ("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4,
            ("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue,
            ("ECO4 original", "", "Sold or cancelled - £", ""): original_warmfront_sold_eco4,
            ("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue,
            # GBIS - original warmfront figures
            ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis,
            ("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis,
            ("GBIS original", "", "Total - £", ""): original_warmfront_gbis_revenue,
            ("GBIS original", "", "Sold or cancelled - £", ""): original_warmfront_sold_gbis,
            ("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue,
            # ECO4 - asset list, pre-ciga
            ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga,
            ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining,
            ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue,
            ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue,
            ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL", ""): pre_ciga_eco4_variance,
            ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL VS ELIGIBLE & INELIGIBLE", ""): variance_total,
            ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 REMAINING VS ELIGIBLE & INELIGIBLE", ""):
                variance_remaining,
            ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold,
            ("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations * eco4_rate,
            # This is for jobs that are in-progress and could still cancel
            ("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations * eco4_rate,
            # ECO4 - asset list, post ciga, total
            ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"):
                eco4_post_ciga_total_results[
                    "ECO4 - post CIGA - #"],
            ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[
                "ECO4 - post CIGA - £"],
            # ECO4 - asset list, post ciga, remaining
            ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[
                "ECO4 - post CIGA - #"],
            ("ECO4 post-ciga", "", "Estimated remaining eligible - £", ""): eco4_post_ciga_remaining_results[
                "ECO4 - post CIGA - £"],
            ("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %",
             ""): eco4_delta_vs_original_estimate_remaining,
            ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""):
                eco4_post_ciga_remaining_results["Of which confirmed - #"],
            ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - £", ""):
                eco4_post_ciga_remaining_results["Of which confirmed - £"],
            ("ECO4 post-ciga", "", "Of which forecast - #", ""):
                eco4_post_ciga_remaining_results["Of which forecast - #"],
            ("ECO4 post-ciga", "", "Of which forecast - £", ""):
                eco4_post_ciga_remaining_results["Of which forecast - £"],
            # Expected ECO4 cancellations
            ("ECO4 Cancellations", "", "Of which expected cancellations - #", ""): eco4_post_ciga_remaining_results[
                "Expected cancellations - #"
            ],
            ("ECO4 Cancellations", "", "Of which expected cancellations - £", ""): eco4_post_ciga_remaining_results[
                "Expected cancellations - £"
            ],
            # Archetype check failures
            ("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - #", ""):
                eco4_post_ciga_remaining_results['Estimated total - failed archetype check - #'],
            ("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - £", ""):
                eco4_post_ciga_remaining_results['Estimated total - failed archetype check - £'],
            # CIGA failures
            ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[
                'Estimated total - failed CIGA'
            ],
            ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - £", ""): eco4_post_ciga_remaining_results[
                'Estimated total - failed CIGA - £'
            ],
            ("ECO4 CIGA failures", "", "Confirmed failures - #", ""): eco4_post_ciga_remaining_results[
                "Confirmed CIGA failures"
            ],
            ("ECO4 CIGA failures", "", "Confirmed failures - £", ""): eco4_post_ciga_remaining_results[
                "Confirmed CIGA failures - £"
            ],
            ("ECO4 CIGA failures", "", "Estimated failures - #", ""): eco4_post_ciga_remaining_results[
                "Estimated CIGA failures"
            ],
            ("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[
                "Estimated CIGA failures - £"
            ],
            # GBIS postcode list
            ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total_pre_cancellations,
            ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"):
                gbis_total_pre_cancellations_revenue,
            ("GBIS Postcode list", "Warmfront post code list", "GBIS VARIANCE", "GBIS total"): gbis_variance,
            ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold,
            ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate,
            # This is for jobs that are in-progress and could still cancel
            ("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations * gbis_rate,
            ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"):
                gbis_remaining_pre_cancellations,
            ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"):
                gbis_remaining_pre_cancellations_revenue,
            ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""):
                gbis_delta_vs_original_estimate_remaining,
            # Expected cancellations
            (
                "GBIS Postcode list", "", "Of which expected sales - £ - £",
                "GBIS total"): gbis_remaining_revenue,
            ("GBIS Postcode list", "", "Of which expected cancellations -£", "GBIS total"):
                gbis_remaining_expected_cancellations_revenue
        }

        # Make sure nothing is forgotten due to duplicate multi-index keys
        if len(to_append) != 51:
            raise ValueError("Something went wrong")

        results.append(to_append)

    results = pd.DataFrame(results)
    results.to_csv("pipeline_remaining_raw.csv")

    totals_row = {}
    for col in results.columns:
        if col == ('', '', '', 'HA Name'):
            totals_row[col] = "Total"
        elif col in [
            ("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", ""),
            ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")
        ]:
            totals_row[col] = None
        else:
            totals_row[col] = results[col].sum()

    # For the delta columns, we calculate the delta on the totals
    totals_row[("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", "")] = (
        (
            totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] -
            totals_row[("ECO4 original", "", "Remaining - #", "")]
        ) / totals_row[("ECO4 original", "", "Remaining - #", "")]
    )

    totals_row[("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")] = (
        (
            totals_row[("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")] -
            totals_row[("GBIS original", "", "Remaining - #", "")]
        ) / totals_row[("GBIS original", "", "Remaining - #", "")]
    )

    blank_row = pd.DataFrame([{col: "" for col in results.columns}])

    # Put together a Warmfront original remaining ECO4 vs asset list remaining ECO4 and same for GBIS, as well as totals

    # ECO4 Headlines
    headline_eco4_original_remaining = totals_row[("ECO4 original", "", "Remaining - #", "")]
    headline_eco4_original_remaining_revenue = totals_row[("ECO4 original", "", "Remaining - £", "")]
    headline_eco4_postcode_list_remaining = totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")]
    headline_eco4_postcode_list_remaining_revenue = totals_row[
        ("ECO4 post-ciga", "", "Estimated remaining eligible - £", "")
    ]
    headline_eco4_delta = 100 * (
        (headline_eco4_postcode_list_remaining - headline_eco4_original_remaining) /
        headline_eco4_original_remaining
    )
    headline_eco4_delta = round(headline_eco4_delta, 1)

    # GBIS Headlines
    headline_gbis_original_remaining = totals_row[("GBIS original", "", "Remaining - #", "")]
    headline_gbis_original_remaining_revenue = totals_row[("GBIS original", "", "Remaining - £", "")]
    headline_gbis_postcode_list_remaining = totals_row[
        ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")
    ]
    headline_gbis_postcode_list_remaining_revenue = totals_row[
        ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total")
    ]
    headline_gbis_delta = 100 * (
        (headline_gbis_postcode_list_remaining - headline_gbis_original_remaining) /
        headline_gbis_original_remaining
    )
    headline_gbis_delta = round(headline_gbis_delta, 1)

    headline_original_total_revenue_remaining = (
        headline_eco4_original_remaining_revenue + headline_gbis_original_remaining_revenue
    )

    headline_postcode_list_total_revenue_remaining = (
        headline_eco4_postcode_list_remaining_revenue + headline_gbis_postcode_list_remaining_revenue
    )
    headline_total_delta = 100 * (
        (headline_postcode_list_total_revenue_remaining - headline_original_total_revenue_remaining) /
        headline_original_total_revenue_remaining
    )
    headline_total_delta = round(headline_total_delta, 1)

    headline_eco4_sold_since_november = (
        totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] +
        totals_row[('ECO4 pre-ciga', '', 'Confirmed cancellations - £', '')] +  # confirmed canclleations
        totals_row[('ECO4 pre-ciga', '', 'Unconfirmed cancellations - £', '')] -  # expected cancellations
        totals_row[('ECO4 original', '', 'Sold or cancelled - £', '')]
    )

    headline_gbis_sold_since_november = (
        totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] +
        totals_row[("GBIS Postcode list", "", "Confirmed cancellations - £", "")] +  # confirmed cancellations
        totals_row[("GBIS Postcode list", "", "Unconfirmed cancellations - £", "")] -  # expected cancellations
        totals_row[('GBIS original', '', 'Sold or cancelled - £', '')]
    )

    headlines = [
        {
            ("", "", "", "HA Name"): "Headlines",
        },
        {
            ("", "", "", "HA Name"): "ECO4 Remaining - November - #",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_original_remaining

        },
        {
            ("", "", "", "HA Name"): "ECO4 Remaining - November - £",
            (
                "", "Original Warmfront estimate", "Total - #",
                "ECO4 - November"): headline_eco4_original_remaining_revenue
        },
        {
            ("", "", "", "HA Name"): "ECO4 Sold or cancelled since November - £",
            (
                "", "Original Warmfront estimate", "Total - #",
                "ECO4 - November"): headline_eco4_sold_since_november
        },
        {
            ("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - #",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining
        },
        {
            ("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - £",
            ("", "Original Warmfront estimate", "Total - #",
             "ECO4 - November"): headline_eco4_postcode_list_remaining_revenue
        },
        {
            ("", "", "", "HA Name"): "ECO4 £ remaining delta - %",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_eco4_delta) + "%"
        },
        {
            ("", "", "", "HA Name"): "GBIS Remaining - November - #",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining
        },
        {
            ("", "", "", "HA Name"): "GBIS Remaining - November - £",
            (
                "", "Original Warmfront estimate", "Total - #",
                "ECO4 - November"): headline_gbis_original_remaining_revenue
        },
        {
            ("", "", "", "HA Name"): "GBIS Sold or cancelled since November - £",
            (
                "", "Original Warmfront estimate", "Total - #",
                "ECO4 - November"): headline_gbis_sold_since_november
        },
        {
            ("", "", "", "HA Name"): "GBIS Remaining - post code list - #",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining
        },
        {
            ("", "", "", "HA Name"): "GBIS Remaining - post code list - £",
            ("", "Original Warmfront estimate", "Total - #",
             "ECO4 - November"): headline_gbis_postcode_list_remaining_revenue
        },
        {
            ("", "", "", "HA Name"): "GBIS delta %",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_gbis_delta) + "%"
        },
        # Total revenue
        {
            ("", "", "", "HA Name"): "Total Remaining - November - £",
            ("", "Original Warmfront estimate", "Total - #",
             "ECO4 - November"): headline_original_total_revenue_remaining
        },
        {
            ("", "", "", "HA Name"): "Total Remaining - post code list (post CIGA) - £",
            ("", "Original Warmfront estimate", "Total - #",
             "ECO4 - November"): headline_postcode_list_total_revenue_remaining
        },
        {
            ("", "", "", "HA Name"): "Total Remaining delta %",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_total_delta) + "%"
        },
    ]

    assumptions = [
        {
            ("", "", "", "HA Name"): "Assumptions",
        },
        {
            ("", "", "", "HA Name"): "ECO4 rate",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(eco4_rate)
        },
        {
            ("", "", "", "HA Name"): "GBIS rate",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(gbis_rate)
        },
        {
            ("", "", "", "HA Name"): "Median CIGA pass rate",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
                round(median_ciga_success_rate * 100, 1)) + "%",
        },
        {
            ("", "", "", "HA Name"): "Maximum allowed CIGA pass rate",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
                round(maximum_ciga_conversion * 100, 1)) + "%",
            ("ECO4 original", "", "Remaining - #",
             ""): "- Maximum allowed CIGA conversion for HAs without CIGA checks We do not allow above this to be "
                  "conservative"
        },
        {
            ("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
                round(median_eco4_to_install * 100, 1)) + "%",
            ("ECO4 original", "", "Remaining - #",
             ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Surveys that resulted "
                  "in cancelled install are excluded."
        },
        {
            ("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate",
            ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
                round(median_ciga_pass_to_install * 100, 1)) + "%",
            ("ECO4 original", "", "Remaining - #",
             ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Surveys that resulted in "
                  "cancelled installs are excluded."
        }
    ]

    results = pd.concat(
        [
            results,
            pd.DataFrame([totals_row]),
            blank_row,
            pd.DataFrame(headlines),
            blank_row,
            blank_row,
            pd.DataFrame(assumptions)
        ]
    )
    with open("HA Remaining Analysis.csv", "w", newline="") as file:
        # Write the DataFrame data without the index (adjust if you want the index).
        results.to_csv(file, header=True, index=False)


def fml_data_pull(loader):
    """
    Pull the newest EPC for every potentially eligible property of each HA and store the results in S3.

    For each housing association (HA) in the hard-coded list, the asset-list rows whose "ECO Eligibility" is
    anything other than "not eligible" are searched one at a time with ``SearchEpc``. Every EPC found is tagged
    with the row's ``asset_list_row_id``; the collected records are saved as a pickled DataFrame to
    ``ha-analysis/revised/<ha>/epc_data.pickle`` in the ``retrofit-datalake-dev`` bucket.

    The pull is best-effort per HA: if anything goes wrong for an HA the error is logged, the HA is recorded as
    failed and the run carries on with the next one.

    :param loader: object exposing ``loader.data[ha]["asset_list"]`` as a DataFrame for every HA in the list
    :return: list of the HA names whose pull failed (empty when every HA succeeded)
    """
    ha_names = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13",
        "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12",
        "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49",
        'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20",
    ]

    # Can't pull these from the EPC database because they're based in Scotland:
    # "HAXXX", "HAXX"
    from backend.SearchEpc import SearchEpc
    # NOTE(review): this credential is committed in source. It should be moved to configuration/environment
    # (the module already reads EPC_AUTH_TOKEN) and rotated.
    epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA="

    failed_has = []
    for ha in ha_names:
        print(f"Pulling data for {ha}")
        try:
            asset_list = loader.data[ha]["asset_list"].copy()
            # Properties found as eligible
            fml = asset_list[asset_list["ECO Eligibility"] != "not eligible"]

            # For each property, search for the latest EPC
            epc_data = []
            for _, row in tqdm(fml.iterrows(), total=fml.shape[0]):

                # Only the property type is used for the search; the built form is discarded
                property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha)

                if ha == "HAXXX":
                    address_parts = [
                        row["Door Number"], row["Address Line 1"], row["Address Line 2"], row["Address Line 3"],
                        row["Postcode"],
                    ]
                    full_address = ", ".join(str(part) for part in address_parts if part is not None)
                else:
                    full_address = row["matching_address"]

                searcher = SearchEpc(
                    address1=str(row["HouseNo"]),
                    postcode=row["matching_postcode"],
                    auth_token=epc_api_key,
                    os_api_key="",
                    property_type=property_type,
                    full_address=full_address,
                    fast=True
                )
                # Force the skipping of estimating the EPC
                searcher.ordnance_survey_client.property_type = None
                searcher.ordnance_survey_client.built_form = None

                searcher.find_property(skip_os=True)
                if searcher.newest_epc is None:
                    # No EPC found for this property - nothing to record
                    continue

                epc_data.append(
                    {
                        "asset_list_row_id": row["asset_list_row_id"],
                        **searcher.newest_epc.copy()
                    }
                )

            # Save the data in S3 as a pickled DataFrame
            epc_data_df = pd.DataFrame(epc_data)
            save_pickle_to_s3(
                data=epc_data_df,
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle"
            )
        except Exception:
            # Deliberately broad: one bad HA must not abort the whole pull, but the failure has to be visible
            logger.exception("EPC data pull failed for %s", ha)
            failed_has.append(ha)

    if failed_has:
        logger.warning("EPC data pull failed for %d HA(s): %s", len(failed_has), ", ".join(failed_has))

    return failed_has


def extract_lower_bound(age_band):
    """
    Return the lower-bound year of a construction age band.

    The band is expected in the form "<region>: <from>-<to>", e.g. "England and Wales: 1930-1949". The open-ended
    form "<region>: <from> onwards" is also understood and yields <from>. Anything that cannot be parsed - missing
    values, non-text values, bands without a colon, or bands with no lower year such as "...: before 1900" - falls
    back to 1930.

    :param age_band: the age band text, or a missing value (None / NaN)
    :return: the lower-bound year as an int, or 1930 when it cannot be determined
    """
    default_year = 1930

    # Missing values (None, NaN, pd.NA) and any other non-text input have nothing to parse
    if not isinstance(age_band, str):
        return default_year

    try:
        # Text after the first colon, up to the range separator, e.g. "1930" or "2007 onwards"
        lower_part = age_band.split(':')[1].split('-')[0].strip()
        # Open-ended bands have no range separator; the year is whatever precedes "onwards"
        if lower_part.lower().endswith("onwards"):
            lower_part = lower_part[:-len("onwards")].strip()
        return int(lower_part)
    except (ValueError, IndexError):
        return default_year


def classify_loft(x):
    """
    Grade a record as "high", "medium" or "unlikely" from its roof insulation thickness and EPC age.

    - thickness <= 100                                    -> "high"
    - thickness <= 200                                    -> "medium"
    - thickness <= 270 and EPC at least 5 * 365 days old  -> "medium"
    - anything else                                       -> "unlikely"

    :param x: mapping / row with "roof_insulation_thickness" (anything ``float()`` accepts) and "epc_age".
        "epc_age" is only read when the thickness is above 200 and at most 270.
    :return: one of "high", "medium", "unlikely"
    """
    depth = float(x["roof_insulation_thickness"])

    # high confidence
    if depth <= 100:
        return "high"

    if depth <= 200:
        return "medium"

    # Thicker insulation only counts when the EPC is at least five years old. The age is looked up lazily so
    # that records outside this thickness band never need an "epc_age" value.
    old_epc_with_moderate_depth = depth <= 270 and x["epc_age"] >= 5 * 365
    return "medium" if old_epc_with_moderate_depth else "unlikely"


def fml_analysis(loader):
    # In the case of the optimistic scenario, we assume that the at-risk pipeline is still viable, just at a lower rate
    optimistic_scenario_rate = 1500

    assumed_ciga_pass_rate = 0.731
    has_bruh = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13",
        "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12",
        "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49",
        'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20",
    ]

    no_ciga_cavity_descriptions = [
        "Cavity wall, as built, insulated (assumed)",
        "Cavity wall, as built, no insulation (assumed)",
        "Cavity wall, as built, partial insulation (assumed)",
        "Cavity wall, no insulation (assumed)",
        "Cavity wall, partial insulation (assumed)",
        "Cavity wall,",
        "Cavity wall, insulated (assumed)",
        "Cavity wall, no insulation (assumed)",
        "Cavity wall, as built, insulated (assumed)",
        "Cavity wall, partial insulation (assumed)",
    ]

    # TODO: There will be some properties that are subject to CIGA that do not look like they need a CIGA check!
    #  Pass them! Non-invasives will have checked the wall though

    results = []
    wall_descriptions = []
    for ha_name in tqdm(has_bruh):

        original_figures = loader.december_figures[
            loader.december_figures["HA Name"] == ha_name
            ].copy()
        original_remaining = original_figures["ECO4 remaining"].values[0]
        original_gbis_remaining = original_figures["GBIS remaining"].values[0]

        # Read in the epc data
        asset_list = loader.data[ha_name]["asset_list"].copy()
        # properties found as eligibile
        fml = asset_list[asset_list["ECO Eligibility"] != "not eligible"]
        epc_data = read_pickle_from_s3(
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle"
        )
        # We make sure we don't have duplicated. We do a super basic drop duplicates because it shouldn't be a huge
        # issue at this point
        epc_data = epc_data.drop_duplicates("uprn")
        wall_descriptions.extend(epc_data["walls-description"].unique().tolist())

        # time from the inspection to now
        epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days
        if "estimated" not in epc_data.columns:
            # For all after HA7, we don't use estimated surveys
            epc_data["estimated"] = False

        fuck_this = fml.merge(
            epc_data, how="left", on="asset_list_row_id"
        )
        fuck_this["estimated"] = fuck_this["estimated"].fillna(True)
        if fuck_this.shape[0] != fml.shape[0]:
            raise Exception("What the fuck bruv")

        # Take just remaining
        if not loader.data[ha_name]["survey_list"].empty:
            survey_list = (
                loader.data[ha_name]["survey_list"][
                    ~pd.isnull(loader.data[ha_name]["survey_list"]["asset_list_row_id"])
                ]
            )
            fuck_this = fuck_this.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            fuck_this = fuck_this[pd.isnull(fuck_this["installation_status"])]
            fuck_this = fuck_this.drop(columns=["installation_status"])

        insulation_thicknesses = []
        for _, x in fuck_this.iterrows():
            if pd.isnull(x["roof-description"]):
                continue
            if x["roof-description"] == "SAP05:Roof":
                continue

            thickness = RoofAttributes(x["roof-description"]).process()["insulation_thickness"]
            # If there is a + in the thickness, strip it out
            thickness = str(thickness).replace("+", "")
            insulation_thicknesses.append(
                {'uprn': x["uprn"], "roof_insulation_thickness": thickness}
            )
        insulation_thicknesses = pd.DataFrame(insulation_thicknesses)

        before_merge_shape = fuck_this.shape[0]
        fuck_this = fuck_this.merge(insulation_thicknesses, how="left", on="uprn")

        if fuck_this.shape[0] != before_merge_shape:
            raise Exception("SOMETHING WENT WRONG")

        # Automated archetype check
        if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")):
            # We perform the archetype test. If the property is a house, it needs to be detached, semi-detached
            # or end terrace. If it's a bungalow, it must be detached
            fuck_this["passes_archetype"] = None
            fuck_this["passes_archetype"] = np.where(
                (fuck_this["property-type"] == "House") &
                (fuck_this["built-form"].isin(["Semi-Detached", "End-Terrace", "Detached"])),
                True,
                fuck_this["passes_archetype"]
            )

            fuck_this["passes_archetype"] = np.where(
                (fuck_this["property-type"] == "Bungalow") &
                (fuck_this["built-form"].isin(["Detached"])),
                True,
                fuck_this["passes_archetype"]
            )

            fuck_this["ECO Eligibility"] = np.where(
                (fuck_this["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)") &
                (fuck_this["passes_archetype"] == True),
                "eco4 (subject to ciga)",
                fuck_this["ECO Eligibility"]
            )

            # If failed the archetype check and needs a CIGA, it's not eligibile
            fuck_this["ECO Eligibility"] = np.where(
                (fuck_this["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)") &
                (fuck_this["passes_archetype"] != True),
                "not eligible",
                fuck_this["ECO Eligibility"]
            )

            fuck_this["ECO Eligibility"] = np.where(
                (fuck_this["ECO Eligibility"] == "eco4 (subject to archetype)") &
                (fuck_this["passes_archetype"] == True),
                "eco4",
                fuck_this["ECO Eligibility"]
            )

            fuck_this["ECO Eligibility"] = np.where(
                (fuck_this["ECO Eligibility"] == "eco4 (subject to archetype)") &
                (fuck_this["passes_archetype"] != True),
                "gbis",
                fuck_this["ECO Eligibility"]
            )

            if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")):
                raise Exception("DO THE DAMN ARCHETYPE CHECK BRO")

        # clean roof insulation
        fuck_this["roof_insulation_thickness"] = fuck_this["roof_insulation_thickness"].fillna("0")
        fuck_this["roof_insulation_thickness"] = fuck_this[
            "roof_insulation_thickness"
        ].str.replace("below average", "50")
        fuck_this["roof_insulation_thickness"] = fuck_this[
            "roof_insulation_thickness"
        ].str.replace("None", "0")
        fuck_this["roof_insulation_thickness"] = fuck_this[
            "roof_insulation_thickness"
        ].str.replace("none", "0")
        fuck_this["roof_insulation_thickness"] = fuck_this[
            "roof_insulation_thickness"
        ].str.replace("average", "150")
        fuck_this["roof_insulation_thickness"] = fuck_this[
            "roof_insulation_thickness"
        ].str.replace("above 150", "150")

        fuck_this["roof_classiciation"] = fuck_this.apply(lambda x: classify_loft(x), axis=1)

        had_survey = fuck_this[fuck_this["estimated"] == False]

        # proportion with a survey:
        proportion_with_survey = 100 * had_survey.shape[0] / fuck_this.shape[0]

        # Let's look just at the ECO4 business
        # For things that had a survey, take the properties that didn't need a CIGA check
        no_ciga_check_needed = had_survey[
            had_survey["ECO Eligibility"] == "eco4"
            ]

        no_ciga_check_needed_eligible = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (no_ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) &
            (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80)
            ]

        # For anything not needing a CIGA check, some of it will be GBIS
        no_ciga_check_needed_eligible_gbis = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) &
            (~no_ciga_check_needed["asset_list_row_id"].isin(no_ciga_check_needed_eligible["asset_list_row_id"].values))
            ]

        # Characterise no CIGA check needed
        # !!!!!!!!!!!! AT RISK !!!!!!!!!!!!
        ciga_check_passed = had_survey[had_survey["ECO Eligibility"] == "eco4 - passed ciga"]
        # These should be treated the same as one that have passed their ciga checks, from a detection perspective
        ciga_check_passed_eligible = ciga_check_passed[
            (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) &
            (ciga_check_passed["roof_classiciation"].isin(["high", "medium"])) &
            (ciga_check_passed["current-energy-efficiency"].astype(float) <= 80)
            ]

        if not loader.data[ha_name]["ciga_list"].empty:

            proportions = loader.data[ha_name]["ciga_list"]["Guarantee"].value_counts(normalize=True)
            ha_ciga_pass_rate = proportions[proportions.index == "No"].values[0]

        else:
            ha_ciga_pass_rate = assumed_ciga_pass_rate

        # We take just the cavity walls
        # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/
        # This paper is based on London properties
        # The proportion of EPCs with building characteristics errors are shown to
        # differ between variables; floor and wall type errors occur in ~10-15% of EPCs,
        # compared with ~5% for wall insulation and glazing performance

        ciga_check_needed = had_survey[
            had_survey["ECO Eligibility"].str.contains("subject to ciga")
        ].copy()

        ciga_check_needed_eligible = ciga_check_needed[
            (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) &
            (ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) &
            (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80)
            ]

        # Finally, characterise gbis properties. Some of the business might look like ECO4 work, whereas we then
        # qualify what actually looks like gbis
        gbis_identified = had_survey[
            had_survey["ECO Eligibility"] == "gbis"
            ].copy()

        gbis_looks_like_eco4 = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (gbis_identified["roof_classiciation"].isin(["high", "medium"])) &
            (gbis_identified["current-energy-efficiency"].astype(float) <= 80) &
            (
                (
                    (gbis_identified["property-type"] == "House") &
                    (gbis_identified["built-form"] != "Mid-Terrace")
                ) | (
                    (gbis_identified["property-type"] == "Bungalow") &
                    (gbis_identified["built-form"].isin(["Detached"]))
                )
            )
            ]

        gbis_qualified = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (gbis_identified["current-energy-efficiency"].astype(float) <= 80) &
            (~gbis_identified["asset_list_row_id"].isin(gbis_looks_like_eco4["asset_list_row_id"].values))
            ]

        ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate)
        without_ciga_expectation = no_ciga_check_needed_eligible.shape[0]
        passed_ciga_expectation = ciga_check_passed_eligible.shape[0]
        identified_as_gbis_looks_like_eco4 = gbis_looks_like_eco4.shape[0]

        # Need to add on the non-ciga
        total_eco4_expectation = (
            ciga_check_expectation +
            without_ciga_expectation +
            passed_ciga_expectation +
            identified_as_gbis_looks_like_eco4
        )

        # This is the work that is at risk
        eco4_work_at_risk = (
            passed_ciga_expectation +
            ciga_check_expectation
        )

        no_ciga_check_needed_actually_gbis = no_ciga_check_needed_eligible_gbis.shape[0]
        gbis_qualified = gbis_qualified.shape[0]

        total_gbis_expectation = no_ciga_check_needed_actually_gbis + gbis_qualified

        if proportion_with_survey < 100:
            # We estimate the rest
            without_survey_needing_ciga = fuck_this[
                (fuck_this["estimated"] == True) &
                (fuck_this["ECO Eligibility"].str.contains("subject to ciga") == True)
                ]

            if without_survey_needing_ciga.empty:
                without_survey_without_ciga_expected = 0
            else:
                # We apply the same conversion rate as the properties with a survey

                if ciga_check_needed.shape[0] == 0 and ciga_check_expectation == 0:
                    without_survey_without_ciga_expected = without_survey_needing_ciga.shape[0]
                else:
                    without_survey_without_ciga_expected = np.round(
                        without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0])
                    )

            without_survey_passed_ciga = fuck_this[
                (fuck_this["estimated"] == True) &
                (fuck_this["ECO Eligibility"] == "eco4 - passed ciga")
                ]

            if without_survey_passed_ciga.empty:
                without_survey_passed_ciga_expected = 0
            else:
                # We apply the same conversion rate as the properties with a survey
                without_survey_passed_ciga_expected = np.round(
                    without_survey_passed_ciga.shape[0] * (passed_ciga_expectation / ciga_check_passed.shape[0])
                )

            # Finally, no ciga needed
            without_survey_eco4 = fuck_this[
                (fuck_this["estimated"] == True) &
                (fuck_this["ECO Eligibility"] == "eco4")
                ]

            if without_survey_eco4.empty:
                without_survey_eco4_expected = 0
                without_survey_gbis_expected = 0
            else:
                # We apply the same conversion rate as the properties with a survey
                without_survey_eco4_expected = np.round(
                    without_survey_eco4.shape[0] * (without_ciga_expectation / no_ciga_check_needed.shape[0])
                )

                without_survey_gbis_expected = np.round(
                    without_survey_eco4.shape[0] * (total_gbis_expectation / no_ciga_check_needed.shape[0])
                )

            # And gbis
            without_survey_gbis = fuck_this[
                (fuck_this["estimated"] == True) &
                (fuck_this["ECO Eligibility"] == "gbis")
                ]

            if without_survey_gbis.empty:
                without_survey_identified_as_gbis_qualified = 0
                without_survey_identified_as_gbis_eco4 = 0
            else:
                # We apply the same conversion rate as the properties with a survey
                without_survey_identified_as_gbis_qualified = np.round(
                    without_survey_gbis.shape[0] * (gbis_qualified / gbis_identified.shape[0])
                )

                without_survey_identified_as_gbis_eco4 = np.round(
                    without_survey_eco4.shape[0] * (identified_as_gbis_looks_like_eco4 / gbis_identified.shape[0])
                )

            total_eco4_expectation = (
                total_eco4_expectation +
                without_survey_without_ciga_expected +
                without_survey_passed_ciga_expected +
                without_survey_eco4_expected +
                without_survey_identified_as_gbis_eco4
            )

            total_gbis_expectation = (
                total_gbis_expectation +
                without_survey_gbis_expected +
                without_survey_identified_as_gbis_qualified
            )

        results.append(
            {
                "HA Name": ha_name,
                "Original ECO4 Estimate - Remaining": original_remaining,
                "Original GGBIS Estimate - Remaining": original_gbis_remaining,
                # "Postcode List - Remaining": postcode_list_remaining,
                # "Of which sold": sales_since_nov,
                "EPC verified ECO4 Eligible - Remaining": int(total_eco4_expectation),
                "EPC verified GBIS Eligibile - Remaining": int(total_gbis_expectation),
                # At risk work
                "Work at risk due to audits": eco4_work_at_risk
            }
        )

    results_df = pd.DataFrame(results)
    results_df.to_csv("analysis - revised - audit update.csv")

    # results_df["Delta vs November"] = 100 * (
    #     results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"]
    # ) / results_df["Original ECO4 Estimate - Remaining"]

    # TODO: Add in estimated GBIS (for eco jobs, of which look like gbis)
    # TODO: Change the left hand side number for our post CIGA estimates


def _summarise_remaining(pipeline_results, eco4_source, gbis_source, label):
    """
    Pull one ECO4/GBIS "remaining" pair out of the pipeline results and add their total.

    :param pipeline_results: dataframe read from the pipeline CSV (its headers are stringified tuples)
    :param eco4_source: name of the pipeline column holding the ECO4 remaining volume
    :param gbis_source: name of the pipeline column holding the GBIS remaining volume
    :param label: suffix used for the output column names, e.g. "All HA Summary"
    :return: dataframe with "HA Name", "# ECO4 remaining - <label>", "# GBIS remaining - <label>" and
        "# Total remaining - <label>", sorted by HA Name
    """
    ha_name_source = "('', '', '', 'HA Name')"
    eco4_col = f"# ECO4 remaining - {label}"
    gbis_col = f"# GBIS remaining - {label}"

    summary = pipeline_results[[ha_name_source, eco4_source, gbis_source]].copy().rename(
        columns={
            ha_name_source: "HA Name",
            eco4_source: eco4_col,
            gbis_source: gbis_col,
        }
    )
    summary[f"# Total remaining - {label}"] = summary[eco4_col] + summary[gbis_col]
    return summary.sort_values("HA Name")


def create_final_report(
    eco4_revenue_per_job=1710,
    gbis_revenue_per_job=600,
    expected_epc_above_postcode_has=("HA17", "HA32"),
):
    """
    This function will produce the final output for the HA analysis.

    It reads "analysis - revised - audit update.csv" and "pipeline_remaining_raw.csv" from the working
    directory, combines the remaining ECO4/GBIS volumes from each source into one table per HA, derives a
    revenue table from it, sanity checks the volumes and writes both tables out as
    "HA Analysis - Audit Update - volumes.csv" and "HA Analysis - Audit Update - revenue.csv".

    :param eco4_revenue_per_job: multiplier used to convert ECO4 volumes (including the at-risk volume)
        into revenue
    :param gbis_revenue_per_job: multiplier used to convert GBIS volumes into revenue
    :param expected_epc_above_postcode_has: the exact set of HA names whose EPC-database ECO4 volume is
        expected to exceed the post-CIGA postcode list volume; any other set fails the check
    :raises ValueError: if any of the sanity checks on the volumes fails
    :return:
    """
    epc_validated_results = pd.read_csv("analysis - revised - audit update.csv")
    pipeline_results = pd.read_csv("pipeline_remaining_raw.csv")

    # The GBIS postcode list figure is shared by the pre-CIGA and post-CIGA views
    gbis_postcode_source = "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')"

    ####################################
    # Original Warmfront estimates
    ####################################
    all_ha_summary_remaining = _summarise_remaining(
        pipeline_results,
        eco4_source="('ECO4 original', '', 'Remaining - #', '')",
        gbis_source="('GBIS original', '', 'Remaining - #', '')",
        label="All HA Summary",
    )

    ####################################
    # Postcode list - pre-CIGA
    ####################################
    postcode_list_pre_ciga_remaining = _summarise_remaining(
        pipeline_results,
        eco4_source="('ECO4 pre-ciga', '', 'Remaining - #', '')",
        gbis_source=gbis_postcode_source,
        label="Postcode list (pre CIGA)",
    )

    ####################################
    # Postcode list - post-CIGA
    ####################################
    postcode_list_post_ciga_remaining = _summarise_remaining(
        pipeline_results,
        eco4_source="('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')",
        gbis_source=gbis_postcode_source,
        label="Postcode list (post CIGA)",
    )

    ####################################
    # From EPC Database
    ####################################
    from_epc_database = epc_validated_results[
        [
            "HA Name",
            "EPC verified ECO4 Eligible - Remaining",
            "EPC verified GBIS Eligibile - Remaining",
            "Work at risk due to audits"
        ]
    ].copy().rename(
        columns={
            "EPC verified ECO4 Eligible - Remaining": "# ECO4 remaining - From EPC Database (post CIGA)",
            "EPC verified GBIS Eligibile - Remaining": "# GBIS remaining - From EPC Database (post CIGA)",
            "Work at risk due to audits": "ECO4 remaining work at risk due to Audits",
        }
    )
    from_epc_database["# Total remaining - From EPC Database (post CIGA)"] = (
        from_epc_database["# ECO4 remaining - From EPC Database (post CIGA)"] +
        from_epc_database["# GBIS remaining - From EPC Database (post CIGA)"]
    )
    from_epc_database = from_epc_database.sort_values("HA Name")

    # Combine the datasets. The final inner join keeps only HAs that have EPC validated results
    volumes = all_ha_summary_remaining.merge(
        postcode_list_pre_ciga_remaining, how="left", on="HA Name"
    ).merge(
        postcode_list_post_ciga_remaining, how="left", on="HA Name"
    ).merge(
        from_epc_database, how="inner", on="HA Name"
    )

    # Convert the volumes to revenue, re-calculating each total from its converted parts
    revenue = volumes.copy()
    for label in [
        "All HA Summary",
        "Postcode list (pre CIGA)",
        "Postcode list (post CIGA)",
        "From EPC Database (post CIGA)",
    ]:
        eco4_col = f"# ECO4 remaining - {label}"
        gbis_col = f"# GBIS remaining - {label}"
        revenue[eco4_col] = revenue[eco4_col] * eco4_revenue_per_job
        revenue[gbis_col] = revenue[gbis_col] * gbis_revenue_per_job
        revenue[f"# Total remaining - {label}"] = revenue[eco4_col] + revenue[gbis_col]

    # The at-risk work is ECO4 work, so it is converted at the ECO4 rate
    revenue["ECO4 remaining work at risk due to Audits"] = (
        revenue["ECO4 remaining work at risk due to Audits"] * eco4_revenue_per_job
    )

    # Replace the # with £ in the columns
    revenue.columns = [col.replace("#", "£") for col in revenue.columns]

    # We check that each column gets smaller
    decreasing_check1 = all(
        volumes["# ECO4 remaining - Postcode list (pre CIGA)"] >=
        volumes["# ECO4 remaining - Postcode list (post CIGA)"]
    )
    if not decreasing_check1:
        raise ValueError("decreasing_check1 failed")

    # Only the expected HAs (HA32 and HA17 by default) should fail this, and it's due to GBIS jobs looking
    # like ECO4
    decreasing_check2 = volumes[
        volumes["# ECO4 remaining - From EPC Database (post CIGA)"] >
        volumes["# ECO4 remaining - Postcode list (post CIGA)"]
    ]
    if set(decreasing_check2["HA Name"].tolist()) != set(expected_epc_above_postcode_has):
        raise ValueError("decreasing_check2 failed")

    # Check for GBIS
    decreasing_check3 = all(
        volumes["# GBIS remaining - Postcode list (pre CIGA)"] >=
        volumes["# GBIS remaining - Postcode list (post CIGA)"]
    )
    if not decreasing_check3:
        raise ValueError("decreasing_check3 failed")

    # The equivalent GBIS check against the EPC database is deliberately not performed - multiple HAs have
    # more GBIS remaining in the EPC database than in the postcode list

    # Store final outputs
    volumes.to_csv("HA Analysis - Audit Update - volumes.csv")
    revenue.to_csv("HA Analysis - Audit Update - revenue.csv")


def identify_eco_works(loader):
    """
    Export the remaining ECO4 candidate properties for a fixed batch of housing associations.

    For each HA in the batch, properties that already have an installation status on the survey list are
    dropped from the asset list. The remainder is split by "ECO Eligibility" into the three ECO4 groups,
    and each non-empty group is written to "local_data/<HA display name> - <group>.csv".

    :param loader: object exposing ``data``, a mapping of HA code to a dict holding the "asset_list" and
        "survey_list" dataframes for that HA
    :return: dataframe with one row per processed HA holding the number of properties in each group
        ("n_needing_ciga", "eco4", "eco4_passed_ciga")
    """
    # Previous batch: HA16 (For Housing), HA39 (Rooftop), HA41 (Settle), HA23 (Lambeth), HA14 (EMH),
    # HA7 (Believe), HA102 (Thrive)

    # Current batch - HA code to the display name used in the exported file names
    names = {
        "HA50": "Unitas",
        "HA15": "Fairhive",
        "HA107": "ACIS",
        "HA24": "LHP"
    }

    # (summary key, "ECO Eligibility" value, file name suffix) for each group we export
    groups = [
        ("n_needing_ciga", "eco4 (subject to ciga)", "needs ciga"),
        ("eco4", "eco4", "eco4"),
        ("eco4_passed_ciga", "eco4 - passed ciga", "eco4 passed ciga"),
    ]

    breakdowns = []
    for ha, data_assets in loader.data.items():
        if ha not in names:
            continue

        asset_list = data_assets["asset_list"].copy()
        survey_list = data_assets["survey_list"].copy()

        if not survey_list.empty:
            asset_list = asset_list.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
            asset_list = asset_list.drop(columns=["installation_status"])

        summary = {"HA Name": ha}
        for summary_key, eligibility, file_suffix in groups:
            group = asset_list[asset_list["ECO Eligibility"] == eligibility].copy()

            # Store the data - empty groups are not written out
            if not group.empty:
                group.to_csv(f"local_data/{names[ha]} - {file_suffix}.csv")

            summary[summary_key] = group.shape[0]

        breakdowns.append(summary)

    breakdowns = pd.DataFrame(breakdowns)
    breakdowns = breakdowns.fillna(0)

    # Previously the breakdown was built and then discarded - hand it back so callers can use it
    return breakdowns


def _worksheet_to_dataframe(worksheet, header_row=1):
    """
    Convert an openpyxl worksheet into a dataframe.

    :param worksheet: the openpyxl worksheet to read
    :param header_row: 1-indexed row holding the column names; every row beneath it is read as data
    :return: dataframe with one row per worksheet row beneath the header
    """
    colnames = [cell.value for cell in worksheet[header_row]]
    rows_data = [
        [cell.value for cell in row]
        for row in worksheet.iter_rows(min_row=header_row + 1, values_only=False)
    ]
    return pd.DataFrame(rows_data, columns=colnames)


def unitas_data_prep(loader):
    """
    Adhoc - for UNITAS (HA50), builds the list of properties to survey in phase 2.

    The steps are:
      1. Remove from the asset list every property with an installation status on the survey list
      2. Remove the properties surveyed since (phase 2 sheet, plus phase 1 rows missing from the survey list)
      3. Take the remaining "eco4" / "eco4 - passed ciga" properties, plus the "subject to ciga" properties
         that the second round of CIGA checks returned with no guarantee
      4. Attach the newest EPC found for each property
      5. Attach the previous survey outcomes (at most two visits per property)
      6. Write the result to "Unitas - phase 2 properties to Survey.xlsx"

    The workbooks are read from local_data/ha_data relative to the working directory.

    :param loader: object exposing ``data``, where ``data["HA50"]`` holds the "asset_list" and "survey_list"
        dataframes
    :raises RuntimeError: if the EPC_AUTH_TOKEN environment variable is not set
    :raises ValueError: if a completed survey or a survey outcome cannot be matched unambiguously, or a
        property has more than two recorded outcomes
    :return:
    """
    # The EPC credential must come from the environment - it was previously hard-coded in this function.
    # Checked up front so that we fail before doing any of the matching work
    epc_api_key = os.getenv("EPC_AUTH_TOKEN")
    if not epc_api_key:
        raise RuntimeError("EPC_AUTH_TOKEN must be set in the environment to retrieve EPC data")

    #####
    # Adhoc - for UNITAS, stripping out additional surveys that have been completed
    unitas_data = loader.data["HA50"].copy()
    unitas_asset_list = unitas_data["asset_list"].copy()
    unitas_survey_sheet = unitas_data["survey_list"].copy()

    # We remove the surveyed properties from the asset sheet
    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
    unitas_asset_list = unitas_asset_list.merge(
        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
        how="left",
        on="asset_list_row_id"
    )
    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])

    # We read in the data for the further completed surveys
    unitas_phase_1_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
    )
    phase_1_worksheet = unitas_phase_1_workbook["ECO 4 - PHASE 1"]
    phase_2_worksheet = unitas_phase_1_workbook["ECO4 - PHASE 2"]

    phase_1_surveys = _worksheet_to_dataframe(phase_1_worksheet)

    # Correct phase 1 surveys in the same fashion as the previous approach
    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())

    # We check all phase 1 surveys are contained in the data we had before. Anything we cannot match to
    # exactly one row of the old survey sheet is treated as an additional completed survey
    additional = []
    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
        matched_1 = unitas_survey_sheet[
            (unitas_survey_sheet["Post Code"] == row["Post Code"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_1.shape[0] == 1:
            continue

        matched_2 = unitas_survey_sheet[
            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_2.shape[0] == 1:
            continue

        additional.append(row.to_dict())
    additional = pd.DataFrame(additional)

    phase_2_surveys = _worksheet_to_dataframe(phase_2_worksheet)
    # Drop all of the occurrences of "OFFICE USE ONLY" columns (blank header cells come through as None)
    phase_2_surveys = phase_2_surveys.drop(
        columns=[c for c in phase_2_surveys.columns if isinstance(c, str) and "OFFICE USE ONLY" in c]
    )
    common_columns = list({c for c in phase_2_surveys.columns if c in additional.columns})
    additional_filtered = additional[common_columns]

    further_unitas_completed_surveys = pd.concat(
        [phase_2_surveys, additional_filtered],
        axis=0,
        ignore_index=True
    )

    # Add a phase 2 key
    further_unitas_completed_surveys["survey_list_row_id"] = [
        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
    ]

    # Surveys confirmed as not being in the asset list, by survey key and by postcode
    not_in_asset_list = [
        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
    ]
    additional_postcodes = ["st28bg"]

    full_asset_list = unitas_data["asset_list"].copy()
    full_asset_list["matching_postcode"] = full_asset_list["matching_postcode"].str.lower().str.replace(" ", "")
    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
        "ST 5DT", "ST3 5DT"
    )

    # We match these back to the asset list
    matching_lookup = []
    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):

        if row["survey_list_row_id"] in not_in_asset_list:
            continue

        postcode_lower = row["Post Code"].lower().strip().replace(" ", "")
        if postcode_lower in additional_postcodes:
            continue

        # Filter asset list on postcode, then on house number
        df = full_asset_list[
            full_asset_list["matching_postcode"].str.contains(postcode_lower)
        ]
        df = df[df["HouseNo"] == str(row["NO."])]

        if df.shape[0] != 1:
            raise ValueError(
                f"Expected exactly one asset list match for survey {row['survey_list_row_id']} "
                f"(postcode {row['Post Code']}, number {row['NO.']}), found {df.shape[0]}"
            )

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    matching_lookup["phase_2_surveyed"] = True

    # We merge this onto the asset list and remove the rows
    unitas_asset_list = unitas_asset_list.merge(
        matching_lookup, how="left", on="asset_list_row_id"
    )
    # Drop rows where phase_2_surveyed is populated
    unitas_asset_list = unitas_asset_list[
        pd.isnull(unitas_asset_list["phase_2_surveyed"])
    ]

    # We add in the new CIGA submissions
    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
    ciga_round_2 = _worksheet_to_dataframe(unitas_round_2_ciga_workbook["Worksheet"])

    ciga_dependent_asset_list = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga")
    ].copy()

    # We merge the ciga sheet to the asset list
    ciga_round_2_matched = ciga_dependent_asset_list.merge(
        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
    )
    # Filter on just the properties that had no guarantee
    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]

    # ECO Eligibility
    # not eligible              9227
    # failed ciga               2711
    # eco4 (subject to ciga)    2238
    # eco4 - passed ciga         901
    # gbis                       114
    # eco4                        91

    # We filter on the properties we're looking to re-survey
    unitas_properties_to_survey = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].isin(
            [
                "eco4 - passed ciga",
                "eco4"
            ]
        )
    ].copy()

    unitas_properties_to_survey = pd.concat(
        [
            unitas_properties_to_survey,
            ciga_round_2_matched[unitas_properties_to_survey.columns]
        ]
    )

    # We now retrieve the latest EPC data
    epc_data = []
    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
        property_type, _ = get_property_type_and_built_form(property_meta=unitas_property, ha_name="HA50")

        full_address = unitas_property["matching_address"]

        searcher = SearchEpc(
            address1=str(unitas_property["HouseNo"]),
            postcode=unitas_property["matching_postcode"],
            auth_token=epc_api_key,
            os_api_key="",
            property_type=property_type,
            full_address=full_address,
            fast=True
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None

        searcher.find_property(skip_os=True)
        if searcher.newest_epc is None:
            continue

        epc = {
            "asset_list_row_id": unitas_property["asset_list_row_id"],
            **searcher.newest_epc.copy()
        }

        epc_data.append(epc)

    epc_df = pd.DataFrame(epc_data)
    # Pull out just the columns we need
    epc_df = epc_df[
        [
            "asset_list_row_id",
            "address1", "postcode",
            "current-energy-efficiency",
            "current-energy-rating",
            "inspection-date",
            "transaction-type",
            "built-form"
        ]
    ]

    epc_df["EPC Rating"] = (
        epc_df["current-energy-efficiency"].astype(str) +
        epc_df["current-energy-rating"].astype(str)
    )

    # Merge onto the Unitas data:
    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
        epc_df[
            [
                "asset_list_row_id",
                "EPC Rating",
                "inspection-date",
                "transaction-type",
                "built-form"
            ]
        ],
        how="left",
        on="asset_list_row_id"
    )

    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
    )

    # Properties without an EPC have nulls in the EPC columns after the left merge - label them explicitly
    for col in ["EPC Rating", "inspection-date", "transaction-type", "built-form"]:
        unitas_properties_to_survey_full[col] = np.where(
            pd.isnull(unitas_properties_to_survey_full[col]),
            "No EPC found",
            unitas_properties_to_survey_full[col]
        )
        unitas_properties_to_survey_full[col] = unitas_properties_to_survey_full[col].astype(str)

    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
        columns={
            "inspection-date": "Last EPC Inspection Date",
            "transaction-type": "Last EPC Reason",
            "built-form": "Last EPC Built Form",
        }
    )

    # We now match to the survey outcomes. The column names of this sheet are on the second row
    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
    )
    unitas_outcomes = _worksheet_to_dataframe(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
    unitas_outcomes = unitas_outcomes.rename(
        columns={
            "Notes                 (If 'no answer' under outcomes, have you checked around the property for access "
            "issues where possible?)": "Notes"
        }
    )

    # Merge outcomes onto properties to survey. The normalised postcode is derived from the frame we are
    # filtering; it used to be added to full_asset_list while the lookup was done on the properties to
    # survey. It is kept as a standalone series so that no helper column ends up in the exported Excel
    survey_postcodes_nospace = (
        unitas_properties_to_survey_full["matching_postcode"].str.lower().str.replace(" ", "")
    )
    outcome_matching = []
    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
        # We search for the corresponding entry in the properties to survey
        postcode_lower = outcome["Postcode"].lower().strip().replace(" ", "")

        # Filter on postcode, then on house number
        df = unitas_properties_to_survey_full[
            survey_postcodes_nospace.str.contains(postcode_lower)
        ]
        df = df[df["HouseNo"] == str(outcome["No."])]

        # Outcomes for properties that are not in the list to survey are ignored
        if df.empty:
            continue

        if df.shape[0] != 1:
            raise ValueError(
                f"Survey outcome for postcode {outcome['Postcode']}, number {outcome['No.']} matched "
                f"{df.shape[0]} properties to survey, expected at most one"
            )

        outcome_matching.append(
            {
                "asset_list_row_id": df["asset_list_row_id"].values[0],
                **outcome.to_dict()
            }
        )
    outcome_matching = pd.DataFrame(outcome_matching)

    # We can have duplicate matches, so we format the Date letter sent column and retrieve the newest outcome
    outcome_matching["Date letters sent"] = outcome_matching["Date letters sent"].str.lower()
    outcome_matching["Extracted Date"] = outcome_matching["Date letters sent"].str.extract(
        r'(?:w[./]c )(\d{2}\.\d{2}\.\d{4})')
    outcome_matching["Extracted Date"] = pd.to_datetime(outcome_matching["Extracted Date"], format='%d.%m.%Y')
    # We sort by asset_list_row_id and extracted date, so the newest outcome comes first for each property
    outcome_matching = outcome_matching.sort_values(["asset_list_row_id", "Extracted Date"], ascending=[True, False])

    # Some properties will have multiple outcomes - for these, we re-format
    outcome_matching_grouped = []
    for asset_list_row_id, grouped_data in outcome_matching.groupby("asset_list_row_id"):
        if grouped_data.shape[0] == 1:
            outcome_matching_grouped.append(
                {
                    "Number of previous visits": 1,
                    **grouped_data.to_dict("records")[0]
                }
            )
            continue

        if grouped_data.shape[0] == 2:
            # The newest visit keeps the original column names, the older one gets a " second visit" suffix
            newest_visit = grouped_data.head(1)
            oldest_visit = grouped_data.tail(1)[['Outcomes', 'Surveyor', 'Notes', 'Date letters sent']].add_suffix(
                " second visit")
            outcome_matching_grouped.append(
                {
                    "Number of previous visits": 2,
                    **newest_visit.to_dict("records")[0],
                    **oldest_visit.to_dict("records")[0]
                }
            )
            continue

        raise ValueError(
            f"Property {asset_list_row_id} has {grouped_data.shape[0]} survey outcomes, "
            "only one or two are supported"
        )

    outcome_matching_grouped = pd.DataFrame(outcome_matching_grouped)

    unitas_properties_to_survey_with_outcomes = unitas_properties_to_survey_full.merge(
        outcome_matching_grouped, how="left", on="asset_list_row_id"
    )
    unitas_properties_to_survey_with_outcomes["Number of previous visits"] = (
        unitas_properties_to_survey_with_outcomes["Number of previous visits"].fillna(0)
    )

    # Store as an excel
    unitas_properties_to_survey_with_outcomes.to_excel("Unitas - phase 2 properties to Survey.xlsx")


def app():
    """
    Run the housing association (HA) analysis for the priority HAs.

    Steps:
      1. Collect every ``.xlsx`` file that sits one level below ``DATA_FOLDER`` (one sub-folder per HA), keeping
         only the sub-folders whose name appears in the priority list below.
      2. Build a ``DataLoader`` from those files and the December figures CSV, load it, and produce the HA facts
         and figures.
      3. Forecast the remaining sales, run the analysis and create the final report.

    NOTE(review): an earlier version of this docstring said the app covers HAs 1, 6, 14, 39 and 107 and that only
    HA 6 has surveys. The priority list below is much broader - confirm whether that statement still holds.

    :return: None
    """

    # Determines if we want to use the cached data in s3
    use_cache = True
    # Determines if we want to perform the data pull.
    # Currently not read anywhere in this function: the EPC data pull (fml_data_pull) is commented out below.
    pull_data = False
    # Override to re-build all inputs
    rebuild_inputs = False

    # Grab the December HA figures filepath
    december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"

    # Add in:
    priority_has = [
        "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24",
        "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54",
        "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42",
        # Added as of March 18th
        "HA44", "HA45", "HA51", "HA52", "HA17", "HA5", "HA20",
        # New HAS
        "HAXX", "HAXXX",
    ]
    # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
    # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
    # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE]
    #
    # Consider for ECO4:
    # HA 70 - have to merge ECO3 list though,
    # HA17 has LOTs of assets, but the asset list is a mess
    # HA53 but has EPCs done
    # Consider for GBIS:
    # Ignore for now:
    # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in

    # List the .xlsx files of the priority HAs only. The HA is identified by the name of the sub-folder directly
    # under DATA_FOLDER rather than by a fixed position in the "/"-split path string, so the filter keeps working
    # when DATA_FOLDER is an absolute path or the platform uses a different path separator.
    directories = [
        str(file)
        for entry in DATA_FOLDER.iterdir()
        if entry.is_dir() and entry.name in priority_has
        for file in entry.iterdir()
        if file.suffix == '.xlsx'
    ]

    loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs)
    loader.load()
    loader.ha_facts_and_figures()

    # Debugging aid - dump / restore the loaded data locally to skip the load step:
    # import pickle
    # with open("ha_analysis_data_temp.pkl", "wb") as f:
    #     pickle.dump(loader, f)
    # import pickle
    # with open("ha_analysis_data_temp.pkl", "rb") as f:
    #     loader = pickle.load(f)

    forecast_remaining_sales(loader)

    # Functions to produce the final output
    # fml_data_pull(loader)  # If we need to pull EPC data
    fml_analysis(loader)
    create_final_report()

    # Adhoc - for HA16, get the properties that still need a CIGA check
    # asset_list_ha16 = loader.data["HA16"]["asset_list"].copy()
    # ha_16_need_ciga = asset_list_ha16[
    #     asset_list_ha16["ECO Eligibility"].str.contains("subject to ciga")
    # ]
    # completed_cigas = loader.data["HA16"]["ciga_list"].copy()
    # # Store the results
    # ha_16_need_ciga.to_csv("ha16_need_ciga.csv")
    # completed_cigas.to_csv("ha16_completed_cigas.csv")
    #
    # # Adhoc - look at the current pipeline and identify how many dormant, CIGA dependent properties there are for
    # # live projects
    #
    # # Read excel
    # orderbook_filepath = "local_data/ha_data/Warmfront HA client order book overview_20240129.xlsx"
    # orderbook_workbook = openpyxl.load_workbook(orderbook_filepath)
    # orderbook_sheet = orderbook_workbook["Contractual Info"]
    # orderbook_colnames = [cell.value for cell in orderbook_sheet[1]]
    #
    # rows = []
    # for row in orderbook_sheet.iter_rows(min_row=2, values_only=False):
    #     row_data = [cell.value for cell in row]  # This will get you the cell values
    #     rows.append(row_data)
    #
    # orderbook = pd.DataFrame(rows, columns=orderbook_colnames)
    # live_orderbook = orderbook[orderbook["Live, New, or Historic?"] == "LIVE"].copy()
    # live_orderbook['Redacted HA'] = live_orderbook['Redacted HA'].str.replace(" ", "")
    #
    # dormant_properties = []
    # missed_has = []
    # for _, customer in live_orderbook.iterrows():
    #     if customer['Redacted HA'] not in loader.data.keys():
    #         missed_has.append(customer['Redacted HA'])
    #         continue
    #     asset_list = loader.data[customer['Redacted HA']]["asset_list"].copy()
    #     survey_list = loader.data[customer['Redacted HA']]["survey_list"].copy()
    #     # Remove sold
    #     if not survey_list.empty:
    #         survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
    #         asset_list = asset_list.merge(
    #             survey_list[["asset_list_row_id", "installation_status"]],
    #             how="left",
    #             on="asset_list_row_id"
    #         )
    #         # Anything that has an installation has gone to installation, and therefore is not remaining
    #         asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
    #         asset_list = asset_list.drop(columns=["installation_status"])
    #
    #     # We pull out the properties that need a CIGA check
    #     need_ciga = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"]
    #     need_archetype = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to archetype)"]
    #     need_ciga_and_archetype = asset_list[
    #         asset_list["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)"
    #         ]
    #
    #     dormant_properties.append(
    #         {
    #             "HA Name": customer['Redacted HA'],
    #             "Need CIGA": need_ciga.shape[0],
    #             "Need Archetype": need_archetype.shape[0],
    #             "Need CIGA and Archetype": need_ciga_and_archetype.shape[0]
    #         }
    #     )
    #
    # dormant_properties = pd.DataFrame(dormant_properties)
    # totals = dormant_properties.sum()
    # totals["HA Name"] = "Total"
    #
    # dormant_properties = pd.concat([dormant_properties, totals.to_frame().T])
    # dormant_properties.to_csv("dormant_properties.csv")
    #
    # loader.december_figures["ECO4 remaining"].sum()
    # december_figures = loader.december_figures.copy()
    # december_figures["ECO4 remaining"] = np.where(
    #     december_figures["ECO4 remaining"] < 0,
    #     0,
    #     december_figures["ECO4 remaining"]
    # )
    # december_figures["ECO4 remaining"].sum()