Model/etl/customers/stonewater/shdf_3_clustering.py

import json
from tqdm import tqdm
import os
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
import urllib.parse
import requests
from datetime import datetime
from scipy import stats

from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
    save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances_argmin_min

# Load environment variables from the backend .env file, then read the EPC API token from the
# environment (os.getenv returns None when the variable is not set).
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

# Manually curated map from internal_id to UPRN, for properties where we do have an EPC but the
# EPC record didn't give us the UPRN. Each entry is preceded by the asset address it refers to and
# any caveats about the UPRN that was chosen.
missing_uprn_map = [
    # 1 Church Street, Alfreton, DE55 7AH
    {"internal_id": 78, "mapped_uprn": None},  # Doesn't seem to exist any more
    # 1 Granville Road, Luton, LU1 1PA
    {"internal_id": 315, "mapped_uprn": 100080148856},
    # 11 College Street, Birstall, Batley, WF17 9HF
    # The EPC record is for 11 and 11a
    {"internal_id": 1090, "mapped_uprn": 83190440},
    # 11a College Street, Birstall, Batley, WF17 9HF
    {"internal_id": 1092, "mapped_uprn": 83143766},
    # Flat 5 Friars Street, Hereford, HR4 0AS
    # TODO: Check this
    # This UPRN is for 5 Friars Court, which is a flat
    {"internal_id": 1384, "mapped_uprn": 200002600892},
    # Flat 7 Friars Street, Hereford, HR4 0AS
    # TODO: Check this
    # This UPRN is for 7 Friars Court, which is a flat
    {"internal_id": 1385, "mapped_uprn": 200002600894},
    # 1 Waverley Street, Dudley, DY2 0YE
    {"internal_id": 3349, "mapped_uprn": 90022438},
    # 5 Brighton Road, Burgh Heath, Tadworth, KT20 6BQ
    # TODO: Check this
    # This UPRN is for 5 Copthorne, Brighton Road, Burgh Heath, KT20 6BQ, which is a flat
    {"internal_id": 5027, "mapped_uprn": 100062145273},
    # Room 1, 21 Coxford Road, Southampton, SO16 5FG
    # This is for 21 Coxford Road
    {"internal_id": 5554, "mapped_uprn": 100060692392},

]
# Convert the list of records into a DataFrame with columns internal_id and mapped_uprn.
missing_uprn_map = pd.DataFrame(missing_uprn_map)

# internal_ids whose EPC records are excluded: compile_data() filters the EPC data with this list.
# All four ids also appear in missing_uprn_map above.
# NOTE(review): the reason for dropping them is not recorded here - presumably the EPC match was
# judged unreliable; confirm before extending this list.
internal_id_epcs_to_drop = [315, 1384, 1385, 3349]


def remove_commas_and_full_stops(input_string: str) -> str:
    """
    Strip every comma and full stop out of a string.

    Used to normalise addresses before fuzzy comparison, so that punctuation
    differences alone don't lower the similarity score.

    Args:
    input_string (str): The text to clean.

    Returns:
    str: A copy of the text with all ',' and '.' characters deleted; every
    other character (including whitespace) is left as it was.
    """
    # A translation table with no replacements and a "delete" set of ',' and '.'
    punctuation_to_drop = str.maketrans("", "", ",.")
    return input_string.translate(punctuation_to_drop)


def get_places_with_retry(searcher, max_retries=5, wait_time=2):
    """
    Call searcher.ordnance_survey_client.get_places_api() until it succeeds
    or the retry budget is exhausted.

    A call counts as successful when the returned object's "status" entry
    equals 200. Any other status, or any exception raised by the call, is
    printed and treated as a failed attempt.

    Args:
    searcher (object): Object exposing an ordnance_survey_client attribute
        whose get_places_api() method performs the lookup.
    max_retries (int): Total number of attempts to make.
    wait_time (int): Seconds to sleep between consecutive attempts.

    Returns:
    result: The successful response, or None once every attempt has failed.
    """
    for attempt_number in range(1, max_retries + 1):
        try:
            response = searcher.ordnance_survey_client.get_places_api()
            status_code = response.get("status")
            if status_code == 200:
                # Success - hand the response straight back to the caller
                return response
            print(f"Attempt {attempt_number} failed with status code: {status_code}")
        except Exception as error:
            # Best-effort retry: report the failure and carry on to the next attempt
            print(f"Attempt {attempt_number} failed with error: {error}")

        # Only pause when there is another attempt still to come
        has_attempts_left = attempt_number < max_retries
        if has_attempts_left:
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    print(f"All {max_retries} attempts failed.")
    return None


def app() -> None:
    """
    This script handles the preparation of the data from Stonewater, to archetype a collection
    of 5.3k properties and reduce that down to a representative set of 450 properties.

    Here, we prepare the input data for clustering. The visible steps are:

    1) Read the asset list from S3 and build a single "full_address" string per asset.
    2) Read the previously extracted EPC data from S3 and score how well each EPC address
       matches the asset list address (house number equality + fuzzy string similarity).
    3) For assets whose EPC match is weak (or that have no EPC), read the previously pulled
       Ordnance Survey (OS) results from S3 and score those matches in the same way.
    4) For OS matches that still look problematic, perform a live backup OS lookup
       (rate limited with a 2 second sleep per asset).
    5) For backup matches whose postcode still disagrees with the asset list, pick the first
       OS candidate with the same postcode, or record the asset as having no match.
    6) Write "Stonewater - addresses with no matches.xlsx" to the working directory.

    Side effects: reads from the "retrofit-data-dev" S3 bucket, calls the OS API through
    SearchEpc, sleeps between API calls and writes one Excel file locally.

    Raises:
        ValueError: if any asset ends up without a full address.

    :return: None
    """

    # TODO: Temp read from local machine - move to s3
    # asset_list = pd.read_excel(
    #     "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
    # )

    asset_list = read_excel_from_s3(
        file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        bucket_name="retrofit-data-dev",
        header_row=4
    )

    # Drop the bottom 4 rows, which are completely missing
    asset_list = asset_list.head(-4)

    # Keep just the columns we're interested in, renamed to our internal naming convention
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
        }
    )

    # Create full address: "address1, [address2, ]City/Town, postcode".
    # address2 is only included when it is present; county is deliberately left out.
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )

    # A null full address means one of the components used above was missing
    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")

    # Pull in the data
    # This data has already been pulled as much as it can be, so we retrieve the existing extraction from S3.
    # The commented-out code below is kept as a record of how that extraction was produced.

    # Perform an initial pull without ordnance survey data
    # epc_data = []
    # older_epc_data = {}
    #
    # for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key="",
    #         full_address=str(asset["full_address"]),
    #         uprn=asset.get("uprn", None),
    #     )
    #     searcher.find_property(skip_os=True)
    #
    #     if searcher.newest_epc is None:
    #         continue
    #
    #     epc_data.append(
    #         {
    #             "internal_id": asset["internal_id"],
    #             **searcher.newest_epc
    #         }
    #     )
    #
    #     if searcher.older_epcs is not None:
    #         older_epc_data[asset["internal_id"]] = searcher.older_epcs
    #
    # # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(epc_data),
    #     s3_file_name="customers/Stonewater/clustering/epc_data.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(older_epc_data),
    #     s3_file_name="customers/Stonewater/clustering/old_epc_data.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # We read this directly from s3
    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )

    # NOTE: older_epc_data is loaded but not used again in this function
    older_epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
        )
    )

    # Perform a comparison between the EPC address and the asset list address, just to double check.
    # The inner merge means only assets that have an EPC record are compared.

    epc_data_df = pd.DataFrame(epc_data)
    address_comparison = (
        asset_list[["internal_id", "full_address", "postcode", "house_number", "address1"]].merge(
            epc_data_df[["internal_id", "address", "postcode", "address1"]].rename(
                columns={
                    "address": "epc_address",
                    "postcode": "epc_postcode",
                    "address1": "epc_address1"
                }
            ),
            how="inner",
            on="internal_id"
        )
    )

    # Produce a metric, showing the matching confidence between the two:
    # first extract the house number from the EPC's first address line...
    address_comparison["epc_extracted_house_number"] = address_comparison["epc_address1"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )

    # ...then compare it, case-insensitively, with the asset list's house number
    address_comparison["house_numbers_match"] = (
        address_comparison["house_number"].str.lower() == address_comparison["epc_extracted_house_number"].str.lower()
    )

    # We also produce an address similarity metric
    # We convert the strings to lower and remove common punctuation

    address_comparison["address_similarity_score"] = address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["address1"].lower()),
            remove_commas_and_full_stops(x["epc_address1"].lower())
        ),
        axis=1
    )

    # Worst matches first, and keep only the columns needed to review the match
    address_comparison = address_comparison.sort_values("address_similarity_score", ascending=True)
    address_comparison = address_comparison[
        ["internal_id", "full_address", "epc_address", "address_similarity_score", "house_numbers_match"]
    ]

    # Anything with a similarity score of 90 or below, or whose house numbers don't match, let's do again
    needs_ordnance_survey = address_comparison[
        (address_comparison["address_similarity_score"] <= 90) |
        (~address_comparison["house_numbers_match"])
        ].copy()

    # Everything else is considered a good EPC match
    is_ok = address_comparison[~address_comparison["internal_id"].isin(needs_ordnance_survey["internal_id"])]
    is_ok = is_ok.sort_values("address_similarity_score", ascending=True)

    # The OS pull list is every asset that isn't a good EPC match - i.e. the weak matches above
    # plus any asset that had no EPC record at all
    os_data_pull_asset_list = asset_list[
        ~asset_list["internal_id"].isin(is_ok["internal_id"].values)
    ].copy()

    # We have already done a partial pull of the Ordnance survey data so we can skip some of the records
    # os_most_relevant_1 = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/os_most_relevant_1.json"
    #     )
    # )
    #
    # os_most_relevant_2 = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/os_most_relevant_2.json"
    #     )
    # )
    #
    # fetched_internal_ids = (
    #     [x["internal_id"] for x in os_most_relevant_1] + [x["internal_id"] for x in os_most_relevant_2]
    # )
    #
    # # We remove any ids we've already fetched
    # os_data_pull_asset_list = os_data_pull_asset_list[
    #     ~os_data_pull_asset_list["internal_id"].isin(fetched_internal_ids)
    # ]
    #
    # # Our OK EPC data (is_ok) + ordnance survey fetched data + the data we need to fetch should equal the total
    # # number of assets
    # assert len(is_ok) + len(fetched_internal_ids) + len(os_data_pull_asset_list) == len(asset_list)

    os_data_pull_asset_list = os_data_pull_asset_list.reset_index(drop=True)

    # For each of these records, we pull the OS data
    # (already done in three batches - the commented-out code is kept as a record of the extraction)
    # ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    # os_most_relevant = []
    # os_all = {}
    # errors = []
    # for _, asset in tqdm(os_data_pull_asset_list.iterrows(), total=len(os_data_pull_asset_list)):
    #     # Calls are throttled to 50 per minute in development mode, so lets just slow this down
    #     time.sleep(2)
    #
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key=ORDNANCE_SURVEY_API_KEY,
    #         full_address=str(asset["full_address"]),
    #         uprn=asset.get("uprn", None),
    #     )
    #     searcher.ordnance_survey_client.full_address = asset["full_address"]
    #     # Attempt to get places data with retry logic
    #     result = get_places_with_retry(searcher)
    #
    #     if result:
    #         # Get the most relevant response
    #         os_most_relevant.append(
    #             {
    #                 "internal_id": asset["internal_id"],
    #                 **searcher.ordnance_survey_client.most_relevant_result
    #             }
    #         )
    #
    #         # Also keep the best 100 results
    #         os_all[asset["internal_id"]] = searcher.ordnance_survey_client.results
    #     else:
    #         # Record the internal_id of the asset that failed
    #         print("Error for address: " + asset["full_address"])
    #         errors.append(asset["internal_id"])

    # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(os_most_relevant),
    #     s3_file_name="customers/Stonewater/clustering/os_most_relevant_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(os_all),
    #     s3_file_name="customers/Stonewater/clustering/os_all_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(errors),
    #     s3_file_name="customers/Stonewater/clustering/errors_3.json",
    #     bucket_name="retrofit-data-dev"
    # )

    # We now collate all of the data for the following steps:
    # 1) Checking the retrieved ordnance survey data against the asset list addresses
    # 2) A second round of querying the EPC api to find the EPC data, in case we retrieve something using uprn
    # 3) Predicting the EPC data for the properties we have no data for
    # 4) Retrieving additional data against the internal_id
    # 5) Creation of final dataset for clustering
    # (Only step 1 is carried out in the remainder of this function.)

    # Stitch the three stored batches of OS results back together. os_all is loaded from JSON,
    # so its keys are strings; later batches overwrite earlier ones on a duplicate key.
    os_most_relevant = []
    os_all = {}
    for i in ["1", "2", "3"]:
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all = {**os_all, **json.loads(os_all_segment)}

    os_most_relevant = pd.DataFrame(os_most_relevant)

    # Line the asset list addresses up against the most relevant OS result for each asset
    os_address_comparison = os_data_pull_asset_list[
        ["internal_id", "full_address", "postcode", "house_number", "address1"]
    ].merge(
        os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )

    # Compare postcode
    # Check for records where the postcode doesn't match
    os_address_comparison["postcodes_match"] = (
        os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
    )

    # The OS result has no separate house number here, so extract it from ADDRESS
    os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )

    # Compare house number
    os_address_comparison["house_numbers_match"] = (
        os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
    )

    # String similarity (this time on the full address rather than just address1)
    os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )

    os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)

    problematic = os_address_comparison.copy()

    # An OS match is problematic if the similarity score is 80 or below, or the house number
    # or postcode disagrees with the asset list
    problematic = problematic[
        (problematic["address_similarity_score"] <= 80) |
        (~problematic["house_numbers_match"]) |
        (~problematic["postcodes_match"])
        ]

    # TODO: We'll label these problematic records as problematic, in the final output

    # different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)

    # NOTE: the key is left blank in source control and must be filled in before running the loop below
    ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    problematic_os = []
    problematic_os_all = {}
    problematic_errors = []
    for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
        # Let's just do a backup pull - we're now using LPI too
        # Sleep between calls to slow the requests down
        time.sleep(2)
        backup_searher = SearchEpc(
            address1=row["address1"],
            postcode=row["postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=ORDNANCE_SURVEY_API_KEY,
            uprn=None,
        )
        # Attempt to get places data with retry logic
        result = get_places_with_retry(backup_searher)

        if result:
            # Get the most relevant response
            problematic_os.append(
                {
                    "internal_id": row["internal_id"],
                    **backup_searher.ordnance_survey_client.most_relevant_result
                }
            )

            # Also keep the best 100 results
            problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
        else:
            # Record the internal_id of the asset that failed
            print("Error for address: " + row["full_address"])
            problematic_errors.append(row["internal_id"])

    # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(problematic_os),
    #     s3_file_name="customers/Stonewater/clustering/problematic_os.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(problematic_os_all),
    #     s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(problematic_errors),
    #     s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
    #     bucket_name="retrofit-data-dev"
    # )

    # Next steps: We should collate all of the data and produce 1 big dataset

    # Compare the backup OS results against the asset list, as we did for the first OS pull
    problematic_os_df = pd.DataFrame(problematic_os)
    problematic_address_comparison = problematic[["internal_id", "full_address", "postcode", "house_number"]].merge(
        problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )

    # Here the OS postcode is taken as the last comma-separated part of ADDRESS,
    # rather than from the POSTCODE column
    problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
    problematic_address_comparison["postcodes_match"] = (
        problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
        "OS_POSTCODE"].str.lower()
    )

    problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )
    problematic_address_comparison = problematic_address_comparison.sort_values(
        "match_similarity_score", ascending=True
    )

    # let's do a house number extraction (this call also passes the OS postcode to get_house_number)
    problematic_address_comparison["extracted_house_number"] = problematic_address_comparison.apply(
        lambda x: SearchEpc.get_house_number(x["ADDRESS"], x["OS_POSTCODE"]), axis=1
    )

    # Only the first token of the asset list house number (before any comma or space) is compared
    problematic_address_comparison["house_numbers_different"] = (
        problematic_address_comparison["house_number"].str.lower().str.split(",").str[0].str.split(" ").str[0] !=
        problematic_address_comparison[
            "extracted_house_number"].str.lower()
    )

    # We perform a final check
    # NOTE: the original intent was to take anything where the postcodes don't match, where the house
    # numbers are different and the match similarity is less than 90, or the match similarity is less
    # than 80 - but the filter below currently only selects records whose postcodes don't match
    final_check = problematic_address_comparison[
        (~problematic_address_comparison["postcodes_match"])
    ]
    final_check = final_check.sort_values("match_similarity_score", ascending=False)
    final_check = final_check.reset_index(drop=True)

    # For each remaining record, look through all of the OS candidates returned by the backup pull
    # and take the first one whose postcode agrees with the asset list
    final_best_matches = []
    no_matches = []
    for _, row in final_check.iterrows():
        os_data = problematic_os_all[row["internal_id"]]
        # Each candidate is wrapped in either a "DPA" or an "LPI" key - unwrap to a flat record
        os_data = pd.DataFrame(
            [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
        )

        # Candidates may carry POSTCODE, POSTCODE_LOCATOR or both: prefer POSTCODE where it is present
        if ("POSTCODE_LOCATOR" in os_data.columns) and ("POSTCODE" in os_data.columns):
            os_data["postcode"] = np.where(
                ~pd.isnull(os_data["POSTCODE"]),
                os_data["POSTCODE"],
                os_data["POSTCODE_LOCATOR"]
            )
        elif "POSTCODE" in os_data.columns:
            os_data["postcode"] = os_data["POSTCODE"]
        else:
            os_data["postcode"] = os_data["POSTCODE_LOCATOR"]
        os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
        if os_data.shape[0] >= 1:
            # Keep the first same-postcode candidate, in the order the results were stored
            final_best_matches.append(
                {
                    "internal_id": row["internal_id"],
                    **os_data.iloc[0].to_dict()
                }
            )
        else:
            no_matches.append(
                {
                    "internal_id": row["internal_id"],
                    "full_address": row["full_address"],
                    "postcode": row["postcode"]
                }
            )

    no_matches = pd.DataFrame(no_matches)

    # Data to be confirmed
    # NOTE: this import rebinds the name no_matches, so the DataFrame built just above is replaced
    # by the object defined in etl.customers.stonewater.no_matches (which needs a "Note" field,
    # as it is selected below)
    from etl.customers.stonewater.no_matches import no_matches
    no_matches_to_export = pd.DataFrame(no_matches)
    no_matches_to_export = asset_list.merge(
        no_matches_to_export[["internal_id", "Note"]],
        how="inner",
        on="internal_id"
    ).rename(
        columns={
            "internal_id": "Osm. ID",
            "customer_asset_id": "Org. ref.",
            "external_address_id": "Address ID",
        }
    )
    # Written to the current working directory, using the customer's original id column names
    no_matches_to_export.to_excel("Stonewater - addresses with no matches.xlsx", index=False)

    # We also confirm final_best_matches
    final_best_matches_df = pd.DataFrame(final_best_matches)[
        ["internal_id", "ADDRESS", "UPRN"]
    ].rename(
        columns={
            "ADDRESS": "Ordnance Survey Address - same postcode (best match)",
            "UPRN": "UPRN - same postcode (best match)"
        }
    )
    # We also get their original match (from the first OS pull)
    final_best_matches_df = final_best_matches_df.merge(
        problematic[["internal_id", "ADDRESS", "UPRN"]].rename(
            columns={
                "ADDRESS": "Ordnance Survey Address - best possible match",
                "UPRN": "UPRN - best possible match"
            }
        ),
        how="inner",
        on="internal_id"
    )

    # merge on the original data
    # NOTE: final_best_matches_df is built here but is not saved or returned by this function
    final_best_matches_df = asset_list.merge(
        final_best_matches_df,
        how="inner",
        on="internal_id"
    ).rename(
        columns={
            "internal_id": "Osm. ID",
            "customer_asset_id": "Org. ref.",
            "external_address_id": "Address ID",
        }
    )

    # Reference: the full customer column -> internal column mapping used at the top of this function
    # "Osm. ID": "internal_id",
    # "Org. ref.": "customer_asset_id",
    # "Postcode": "postcode",
    # "House no": "house_number",
    # "Name": "address1",
    # "Address line 2": "address2",
    # "City/Town": "city_town",
    # "County": "county",
    # "Address ID": "external_address_id",


def _flatten_os_results(os_results):
    """Unwrap a list of OS results (each keyed by "DPA" or "LPI") into one flat DataFrame."""
    return pd.DataFrame([z["DPA"] if "DPA" in z else z["LPI"] for z in os_results])


def filter_os_data(p_os_data, p_os_data_all, udprn, is_flat):
    """
    Pick the Ordnance Survey record(s) that best represent a property.

    Args:
    p_os_data (pd.DataFrame): The "most relevant" OS record(s) already selected for the property.
    p_os_data_all (list): Every OS result returned for the property; each item is a dict
        holding the record under a "DPA" key or, failing that, an "LPI" key.
    udprn: The property's UDPRN, or None if it is unknown. Compared as a string, so an int
        and its string form are treated as the same UDPRN.
    is_flat (bool): Whether the property is a flat. Only used when udprn is None.

    Returns:
    pd.DataFrame:
        - udprn is None: the first record of p_os_data_all (restricted to records whose
          CLASSIFICATION_CODE is "RD06" when is_flat is true), i.e. at most one row.
        - otherwise: the rows of p_os_data whose UDPRN equals udprn, falling back to the
          matching rows of p_os_data_all when p_os_data has no match.
        An empty DataFrame is returned when nothing matches, including when the results
        are empty or don't carry the column being filtered on.
    """
    if udprn is None:
        candidates = _flatten_os_results(p_os_data_all)
        if is_flat:
            # No classification information at all (e.g. no results) means no usable flat record
            if "CLASSIFICATION_CODE" not in candidates.columns:
                return candidates.head(0)
            # NOTE(review): "RD06" is presumably the OS classification code for flats - confirm
            candidates = candidates[candidates["CLASSIFICATION_CODE"] == "RD06"]

        return candidates.head(1)

    # Normalise both sides to strings: the stored UDPRN may be numeric or text, and a
    # number never compares equal to a string, which would silently discard a real match.
    udprn = str(udprn)

    if "UDPRN" in p_os_data.columns:
        final_os_data = p_os_data[p_os_data["UDPRN"].astype(str) == udprn]
    else:
        final_os_data = p_os_data.head(0)

    if final_os_data.empty:
        # The most relevant record isn't this UDPRN - search the full result set instead
        candidates = _flatten_os_results(p_os_data_all)
        if "UDPRN" not in candidates.columns:
            # Empty results, or results with no UDPRN on any record: nothing can match
            return candidates.head(0)
        final_os_data = candidates[candidates["UDPRN"].astype(str) == udprn]

    return final_os_data


def compile_data():
    """
    Various data sources have been produced to create the final data source for Stonewater.
    This function combines them
    :return:
    """
    ########################################################################
    # Read in data
    ########################################################################
    # asset_list = read_excel_from_s3(
    #     file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
    #     bucket_name="retrofit-data-dev",
    #     header_row=4
    # )
    #
    # udprn_data = read_excel_from_s3(
    #     file_key="customers/Stonewater/UDPRN updated RA Sample for 5 year programme.xlsx",
    #     bucket_name="retrofit-data-dev",
    #     header_row=0
    # )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})

    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        header=4
    )

    udprn_data = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
    )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)

    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
    asset_list = asset_list.rename(columns={"UDPRN": "udprn"})

    # Read in the lookups
    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
    )))

    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
    )))
    uprn_lookup_2 = uprn_lookup_2.rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )

    # concat
    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])

    # TODO: Read in UPRNs or UDPRN
    #       UPRN LOOKUPS TO READ IN: address_uprn_udprn_lookup, address_uprn_udprn_lookup_2

    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )
    epc_data = pd.DataFrame(epc_data)

    # We drop come EPCS
    epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]

    # This we can use to produce additional variables such as number of old surveys
    # older_epc_data = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
    #     )
    # )
    # older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}

    # This is the first ordnance survey data pull
    os_most_relevant_1 = []
    os_all_1 = {}
    for i in tqdm(["1", "2", "3"]):
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant_1.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}

    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)

    # This is the second ordnance survey data pull
    os_most_relevant_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
    )
    os_most_relevant_2 = json.loads(os_most_relevant_2)
    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)

    os_all_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
    )
    os_all_2 = json.loads(os_all_2)

    ########################################################################
    # Prepare asset list
    ########################################################################
    # TODO: Merge on UPRNs

    # Keep just the columns we're interested in
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
            "udprn"
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
        }
    )

    # Create full address
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )

    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")

    # Merge on UDPRN

    asset_list = asset_list.merge(
        uprn_lookup.drop(columns=["udprn"]), how="left", on=["internal_id", "external_address_id"]
    )

    # This is everything without a uprn

    # Quick check to see if we have os data for every property that doesn't have an EPC
    without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]

    os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
    os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()

    missing_os_data = []
    for _, x in without_epc.iterrows():
        # We would prioritise the data pulled the second time around

        internal_id = x["internal_id"]
        if internal_id in os_most_relevant_2_internal_ids:
            continue

        if internal_id in os_most_relevant_1_internal_ids:
            continue

        missing_os_data.append(internal_id)

    if len(missing_os_data):
        raise Exception("We don't have SOME data for each internal_id")

    # Let's create a lookup table of internal_id, external_address_id, udprn, uprn, standardised_address
    address_uprn_udprn_lookup = []
    for _, x in without_epc.iterrows():
        if pd.isnull(x["UDPRN"]):
            continue
        udprn = str(int(x["UDPRN"]))
        internal_id = x["internal_id"]

        is_flat = "flat" in x["address1"].lower()

        # Get the OS data
        final_os_data = pd.DataFrame()
        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
            p_os_data_all = os_all_1[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)

        if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
            p_os_data_all = os_all_2[str(internal_id)]

            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)

        if final_os_data.empty:
            continue

        if final_os_data.shape[0] != 1:
            if final_os_data["UPRN"].nunique() > 1:
                raise Exception("Investigate me")

        address_uprn_udprn_lookup.append(
            {
                "internal_id": internal_id,
                "external_address_id": x["external_address_id"],
                "udprn": udprn,
                "uprn": final_os_data["UPRN"].values[0],
                "standardised_address": final_os_data["ADDRESS"].values[0],
                "standardised_postcode": final_os_data["POSTCODE"].values[0]
            }
        )

    # Store this lookup
    # save_data_to_s3(
    #     data=json.dumps(address_uprn_udprn_lookup),
    #     s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json",
    #     bucket_name="retrofit-data-dev"
    # )

    address_uprn_udprn_lookup = pd.DataFrame(address_uprn_udprn_lookup)
    missed = asset_list[~asset_list["internal_id"].isin(address_uprn_udprn_lookup["internal_id"].values)]

    address_comparison = (
        asset_list[
            ["internal_id", "external_address_id", "UDPRN", "full_address", "postcode", "house_number", "address1"]
        ].merge(
            epc_data[["internal_id", "address", "postcode", "address1", "uprn"]].rename(
                columns={
                    "address": "epc_address",
                    "postcode": "epc_postcode",
                    "address1": "epc_address1"
                }
            ),
            how="inner",
            on="internal_id"
        )
    )

    address_comparison["address_similarity_score"] = address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["address1"].lower() + x["postcode"].lower()),
            remove_commas_and_full_stops(x["epc_address1"].lower() + x["epc_postcode"].lower())
        ),
        axis=1
    )
    address_comparison = address_comparison.sort_values("address_similarity_score", ascending=False)
    # Cond
    confident = address_comparison[address_comparison["address_similarity_score"] >= 95]
    low_confidence = address_comparison[address_comparison["address_similarity_score"] < 95].copy()

    lookup_2 = confident[
        [
            'internal_id', 'external_address_id', 'UDPRN', 'uprn',
            'epc_address', 'epc_postcode']
    ].rename(columns={"UDPRN": "udprn"})

    # Store in S3
    # save_data_to_s3(
    #     data=json.dumps(lookup_2.to_dict("records")),
    #     s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json",
    #     bucket_name="retrofit-data-dev"
    # )

    # Need to deal with the low confidence records
    low_confidence_asset_list = asset_list[asset_list["internal_id"].isin(low_confidence["internal_id"])]
    for _, x in low_confidence_asset_list.iterrows():
        udprn = str(int(x["UDPRN"]))
        internal_id = x["internal_id"]
        # Get the OS data
        final_os_data = pd.DataFrame()
        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
            p_os_data_all = os_all_1[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)

        if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
            p_os_data_all = os_all_2[str(internal_id)]

            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)

    # For the EPC data, some of them are missing UPRN
    epc_data = epc_data.merge(missing_uprn_map, how="left", on="internal_id")
    epc_data["uprn"] = np.where(
        epc_data["uprn"] == "",
        epc_data["mapped_uprn"],
        epc_data["uprn"]
    )
    epc_data = epc_data.drop(columns=["mapped_uprn"])

    # Once we have UPRNs, we might want to pull in the EPC data again
    # epc_data_with_uprn = []
    # older_epc_data_with_uprn = {}
    #
    # for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key="",
    #         full_address=str(asset["full_address"]),
    #         uprn=asset["uprn"]
    #     )
    #     searcher.find_property(skip_os=True)
    #
    #     if searcher.newest_epc is None:
    #         continue
    #
    #     epc_data_with_uprn.append(
    #         {
    #             "internal_id": asset["internal_id"],
    #             **searcher.newest_epc
    #         }
    #     )
    #
    #     if searcher.older_epcs is not None:
    #         older_epc_data_with_uprn[asset["internal_id"]] = searcher.older_epcs

    # We now get the remaining properties
    # TODO: We might want to use epc_data_with_uprn
    remaining_properties = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]

    # We estimate the data
    final_epcs = []
    for _, p in remaining_properties.iterrows():
        internal_id = p["internal_id"]
        uprn = p["UPRN"]

        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id].to_dict("records")[0]
            p_os_full = os_all_1[str(internal_id)]
        else:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id].to_dict("records")[0]
            p_os_full = os_all_2[str(internal_id)]
        p_os_full = pd.DataFrame(
            [x["DPA"] if "DPA" in x else x["LPI"] for x in p_os_full]
        )

        # TODO: Add this back in
        # When we have this
        if p["uprn"] != p_os_data["UPRN"]:
            # Get it from the older data
            filtered = p_os_full[p_os_full["UPRN"] == p["uprn"]]
            p_os_data = filtered.to_dict("records")[0]

        searcher = SearchEpc(
            address1=str(p["address1"]),
            postcode=str(p["postcode"]),
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            uprn=uprn
        )
        searcher.ordnance_survey_client.parse_classification_code(p_os_data["CLASSIFICATION_CODE"])

        searcher.find_property(skip_os=True)

        final_epcs.append(
            {
                "internal_id": internal_id,
                **searcher.newest_epc
            }
        )

    final_epcs = pd.DataFrame(final_epcs)

    complete_epcs = pd.concat(
        [
            epc_data,
            final_epcs
        ]
    )

    # We now pull spatial data
    # We get the spatial file list and loop through each EPC and determine which file it needs.
    # We then just read in the files that we need and get the data, for each uprn from that file

    uprn_filenames = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
    )

    uprn_lookup = {}
    for uprn in complete_epcs["uprn"]:
        if pd.isnull(uprn):
            # TODO: Do something about this!
            continue
        filtered_df = uprn_filenames[
            (uprn_filenames["lower"] <= int(uprn))
            & (uprn_filenames["upper"] >= int(uprn))
            ]
        if filtered_df["filenames"].values[0] in uprn_lookup:
            uprn_lookup[filtered_df["filenames"].values[0]].append(int(uprn))
        else:
            uprn_lookup[filtered_df["filenames"].values[0]] = [int(uprn)]

    spatial_data_to_uprn = []
    for filename, associated_uprn in tqdm(uprn_lookup.items(), total=len(uprn_lookup)):
        # Read in the file
        spatial_data = read_dataframe_from_s3_parquet(
            bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
        )

        spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
        spatial_data_to_uprn.append(spatial_df)

    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)

    # TODO: Let's store this in s3
    # save_data_to_s3(
    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
    #     bucket_name="retrofit-data-dev"
    # )

    # We merge this spatial data onto final EPCS


def concatenate_row(row):
    """
    Join the populated values of a row into a single comma separated string.

    Null values (None / NaN) and empty strings are skipped; every remaining
    value is converted to ``str`` and the pieces are joined with ", ".

    :param row: pandas Series holding the values to join, e.g. one row of a
        DataFrame as passed in by ``DataFrame.apply(..., axis=1)``
    :return: the joined string, or "" when the row has no populated values
    """
    # Select the populated entries with an explicit mask rather than
    # ``replace('', None)``: older pandas releases ignore the explicit None
    # and forward-fill instead, which would duplicate the previous value in
    # the output.
    populated = row[row.notna() & (row != '')]
    return ', '.join(populated.astype(str))


def adjust_clusters(cluster_allocation, total_clusters):
    """
    Nudge a per-group cluster allocation so that it sums to ``total_clusters``.

    The difference between ``total_clusters`` and the current total is handed
    out one cluster at a time, starting with the group that currently has the
    largest allocation and moving towards the smallest (ties keep the dict's
    own order). If the difference is larger than the number of groups, the
    groups are cycled through again, in the same order, until the difference
    has been used up.

    The dictionary is modified in place and also returned. No lower bound is
    enforced when decreasing, so a group can end up at zero or below if
    ``total_clusters`` is small enough.

    :param cluster_allocation: dict mapping group -> number of clusters
    :param total_clusters: the total the allocation should sum to
    :return: the same ``cluster_allocation`` dict, after adjustment
    """
    adjustment = total_clusters - sum(cluster_allocation.values())

    # Nothing to do, or nothing to distribute the adjustment over (the guard
    # on an empty dict also prevents the loop below from spinning forever).
    if adjustment == 0 or not cluster_allocation:
        return cluster_allocation

    step = 1 if adjustment > 0 else -1

    # Order is fixed up front: every full pass changes each group by the same
    # amount, so the largest-first ordering is still valid on later passes.
    ordered_groups = sorted(cluster_allocation, key=lambda group: -cluster_allocation[group])

    remaining = abs(adjustment)
    while remaining > 0:
        for group in ordered_groups:
            cluster_allocation[group] += step
            remaining -= 1
            if remaining <= 0:
                break

    return cluster_allocation


def compile_data_final():
    # Updated version:

    """
    Various data sources have been produced to create the final data source for Stonewater.
    This function combines them
    :return:
    """
    ########################################################################
    # Read in data
    ########################################################################

    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        header=4
    )

    udprn_data = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
    )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)

    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
    asset_list = asset_list.rename(columns={"UDPRN": "udprn"})

    # Read in the lookups
    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
    )))
    uprn_lookup_1["match_type"] = "Exact"

    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
    )))
    uprn_lookup_2 = uprn_lookup_2.rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )
    uprn_lookup_2["match_type"] = "EPC"
    uprn_lookup_2["uprn"] = np.where(
        uprn_lookup_2["internal_id"] == 1091,
        83143766,
        uprn_lookup_2["uprn"]
    )

    uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
    )))
    uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
        concatenate_row, axis=1
    )
    uprn_lookup_3 = uprn_lookup_3[
        ["udprn", "uprn", "standardised_address", "postcode"]
    ].rename(columns={"postcode": "standardised_postcode"})
    uprn_lookup_3["match_type"] = "Exact"

    uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
    uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
    uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
    # prepare lookup 4
    uprn_lookup_4 = []
    for _, x in uprn_lookup_4_basis.iterrows():

        property_type = None
        built_form = None
        if x["option"] == 1:
            uprn = x["os_option_1_uprn"]
            standardised_address = x["os_option_1_address"]
            postcode = x["os_option_1_postcode"]
        elif x["option"] == 2:
            uprn = x["os_option_2_uprn"]
            standardised_address = x["os_option_2_address"]
            postcode = x["os_option_2_address"].split(", ")[-1]
        else:
            uprn = x["manual_uprn"]
            standardised_address = x["manual_address"]
            postcode = x["manual_postcode"]

        uprn_lookup_4.append(
            {
                "internal_id": x["internal_id"],
                "external_address_id": x["external_address_id"],
                "uprn": uprn,
                "standardised_address": standardised_address,
                "standardised_postcode": postcode,
                "property_type": property_type,
                "built_form": built_form
            }
        )
    uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
    uprn_lookup_4["match_type"] = "Fuzzy"

    # concat
    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])

    # We now merge all of the UPRNs onto the asset list
    assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)

    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )
    epc_data = pd.DataFrame(epc_data)

    epc_data["uprn"] = np.where(
        epc_data["internal_id"] == 1091,
        83143766,
        epc_data["uprn"]
    )

    # We drop come EPCS
    epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]

    # This we can use to produce additional variables such as number of old surveys
    # older_epc_data = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
    #     )
    # )
    # older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}

    ########################################################################
    # Prepare asset list
    ########################################################################

    # Keep just the columns we're interested in
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
            "udprn",
            "Owning body"
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
            "Owning body": "owner"
        }
    )

    # Create full address
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )

    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")

    # Final preps of lookups
    uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
    uprn_lookup_3 = uprn_lookup_3.merge(
        asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
    )
    uprn_lookup = pd.concat([
        uprn_lookup,
        uprn_lookup_3,
        uprn_lookup_4
    ])
    uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)

    asset_list = asset_list.merge(
        uprn_lookup.drop(columns=["udprn"]),
        how="inner",
        on=["internal_id", "external_address_id"]
    )

    # Store locally
    # asset_list.to_excel("Stonewater asset list with uprn.xlsx")

    # We take just domestic properties

    # This is the first ordnance survey data pull
    os_most_relevant_1 = []
    os_all_1 = {}
    for i in tqdm(["1", "2", "3"]):
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant_1.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}

    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)

    # This is the second ordnance survey data pull
    os_most_relevant_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
    )
    os_most_relevant_2 = json.loads(os_most_relevant_2)
    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)

    os_all_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
    )
    os_all_2 = json.loads(os_all_2)

    needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]

    os_1_ids = os_most_relevant_1["internal_id"].values
    os_2_ids = os_most_relevant_2["internal_id"].values

    epc_data_batch_2 = []
    older_epcs_batch_2 = {}
    for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
        if pd.isnull(property["uprn"]):
            continue
        searcher = SearchEpc(
            address1=", ".join(property["standardised_address"].split(", ")[:-1]),
            postcode=property["standardised_postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            full_address=property["standardised_address"],
            uprn=property["uprn"]
        )
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None and property["match_type"] == "Exact":
            # Estimate!
            # Get the OS data
            p_os_df = pd.DataFrame()
            if property["internal_id"] in os_1_ids:
                p_os_df = pd.DataFrame(
                    [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
                )
                p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]

            if p_os_df.empty:
                p_os_df = pd.DataFrame(
                    [x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
                )
                p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]

            if not p_os_df.empty:
                searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
            else:
                searcher.ordnance_survey_client.property_type = ""
            # Now we estimate
            searcher.newest_epc = searcher.estimate_epc(
                property_type=searcher.ordnance_survey_client.property_type,
                built_form=searcher.ordnance_survey_client.built_form,
                lmks_to_drop=None,
                exclude_old=True
            )

        elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":

            if "flat" in property["standardised_address"].lower():
                searcher.newest_epc = searcher.estimate_epc(
                    property_type="Flat",
                    built_form=None,
                    lmks_to_drop=None,
                    exclude_old=True
                )
            else:
                searcher.newest_epc = searcher.estimate_epc(
                    property_type="House",
                    built_form=None,
                    lmks_to_drop=None,
                    exclude_old=True
                )

        epc_data_batch_2.append(
            {
                "internal_id": property["internal_id"],
                **searcher.newest_epc
            }
        )

        if searcher.older_epcs is not None:
            older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
    # Store in S3
    # TODO - read in instead of running
    # save_pickle_to_s3(
    #     data=epc_data_batch_2,
    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_pickle_to_s3(
    #     data=older_epcs_batch_2,
    #     s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
    #     bucket_name="retrofit-data-dev"
    # )

    epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
    complete_epcs = pd.concat([epc_data, epc_data_batch_2])

    # We now prepare the final data for clustering
    uprn_filenames = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
    )

    uprn_map = {}
    for uprn in complete_epcs["uprn"]:
        filtered_df = uprn_filenames[
            (uprn_filenames["lower"] <= int(uprn))
            & (uprn_filenames["upper"] >= int(uprn))
            ]
        if filtered_df["filenames"].values[0] in uprn_map:
            uprn_map[filtered_df["filenames"].values[0]].append(int(uprn))
        else:
            uprn_map[filtered_df["filenames"].values[0]] = [int(uprn)]

    spatial_data_to_uprn = []
    for filename, associated_uprn in tqdm(uprn_map.items(), total=len(uprn_map)):
        # Read in the file
        spatial_data = read_dataframe_from_s3_parquet(
            bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
        )

        spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
        spatial_data_to_uprn.append(spatial_df)

    # TODO: Let's store this in s3
    # save_pickle_to_s3(
    #     data=spatial_data_to_uprn,
    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
    #     bucket_name="retrofit-data-dev"
    # )

    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)

    spatial_data_to_uprn = spatial_data_to_uprn.drop(
        columns=["partition", "filename"]
    ).rename(columns={"UPRN": "uprn"})
    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)

    property_attributes = complete_epcs.merge(
        spatial_data_to_uprn,
        how="inner",
        on="uprn"
    )

    property_attributes = property_attributes.merge(
        asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
    )

    # TODO: Add on data from the asset list such as ownership

    # We drop the columns we don't care about for clustering
    property_attributes = property_attributes.drop(
        columns=[
            "address",
            "uprn-source",
            "heating-cost-potential",
            "hot-water-cost-potential",
            "potential-energy-rating",
            "environment-impact-potential",
            "address3",
            "local-authority-label",
            "sheating-energy-eff",
            "local-authority-label",
            "county",
            "postcode",
            "constituency",
            "co2-emissions-potential",
            "energy-consumption-potential",
            "local-authority",
            "inspection-date",
            "address1",
            "constituency-label",
            "building-reference-number",
            "floor-energy-eff",
            "address2",
            "posttown",
            "floor-env-eff",
            "sheating-env-eff",
            "lighting-cost-potential",
            "main-heating-controls",
            "transaction-type",
            "uprn",
            "lodgement-date",
            "lmk-key",
            "wind-turbine-count",
            "tenure",
            "potential-energy-efficiency",
            "glazed-area"
        ]
    )

    # Fields to transform: lodgement-datetime
    property_attributes["days_since_last_epc"] = (
        datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
    ).dt.days

    property_attributes = property_attributes.drop(columns=["lodgement-datetime"])

    # Up to:
    # Round averages to nearest integer
    fill_with_average = [
        "low-energy-fixed-light-count",
        "floor-height",
        "heating-cost-current",
        "fixed-lighting-outlets-count",
        "hot-water-cost-current",
        "number-heated-rooms",
        "co2-emiss-curr-per-floor-area",
        "total-floor-area",
        "environment-impact-current",
        "co2-emissions-current",
        "number-habitable-rooms",
        "energy-consumption-current",
        'lighting-cost-current',
        "low-energy-lighting",
    ]

    fill_with_mode = [
        "multi-glaze-proportion",
        "extension-count",
    ]

    fill_with_zero = [
        "unheated-corridor-length",
        "number-open-fireplaces",
        "photo-supply",
    ]

    fill_with_categorical = {
        "construction-age-band": "unknown",
        "mainheat-energy-eff": "N/A",
        "windows-env-eff": "N/A",
        "lighting-energy-eff": "N/A",
        "energy-tariff": 'NO DATA!',
        "mechanical-ventilation": 'NO DATA!',
        "solar-water-heating-flag": "N",
        "mains-gas-flag": "N",
        "heat-loss-corridor": "unknown",
        "flat-storey-count": "Not a flat",
        "roof-energy-eff": "N/A",
        "hot-water-env-eff": "N/A",
        "mainheatc-energy-eff": "N/A",
        "main-fuel": 'NO DATA!',
        "lighting-env-eff": "N/A",
        "windows-energy-eff": "N/A",
        "roof-env-eff": "N/A",
        "walls-env-eff": "N/A",
        "mainheat-env-eff": "N/A",
        "flat-top-storey": "N",
        "mainheatc-env-eff": "N",
        "floor-level": "NODATA!",
        "hot-water-energy-eff": "N/A",
        "glazed-type": "unknown"
    }

    # Consolidation columns to single value
    consolidation_columns = {
        "glazed-type": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
        "mechanical-ventilation": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
        "solar-water-heating-flag": {"from": [''], "to": "N"},
        "mains-gas-flag": {"from": [''], "to": "N"},
        "heat-loss-corridor": {"from": ['NO DATA!', ''], "to": "N"},
        "flat-top-storey": {"from": [''], "to": "N"},
        "floor-level": {"from": [""], "to": "NODATA!"}
    }

    # Perform the cleaning
    for col in fill_with_average:
        property_attributes[col] = property_attributes[col].replace('', None)
        avg_val = np.mean([float(x) for x in property_attributes[col].values if x not in [None, "", np.nan]])
        if pd.isnull(avg_val):
            raise Exception("something went wrong")
        property_attributes[col] = property_attributes[col].fillna(round(avg_val))
        property_attributes[col] = property_attributes[col].astype(float)

    for c in fill_with_zero:
        property_attributes[c] = property_attributes[c].replace('', 0)
        property_attributes[c] = property_attributes[c].fillna(0)
        property_attributes[c] = property_attributes[c].astype(float)

    for col in fill_with_mode:
        property_attributes[col] = property_attributes[col].replace('', None)
        mode_val = stats.mode([float(x) for x in property_attributes[col].values if x not in [None, "", np.nan]])[0]
        if pd.isnull(mode_val):
            raise Exception("something went wrong")
        property_attributes[col] = property_attributes[col].fillna(mode_val)
        property_attributes[col] = property_attributes[col].astype(float)

    for c, fill_val in fill_with_categorical.items():
        property_attributes[c] = property_attributes[c].replace('', fill_val)
        property_attributes[c] = property_attributes[c].fillna(fill_val)

    # Finally, consolidate
    for c, consolidate_config in consolidation_columns.items():
        for v in consolidate_config["from"]:
            property_attributes[c] = property_attributes[c].replace(v, consolidate_config["to"])

    property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
    property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
    property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
        property_attributes["days_since_last_epc"].mean()
    )

    missings = pd.isnull(property_attributes).sum()
    missings = missings[missings > 0]

    # Save this
    # save_pickle_to_s3(
    #     data=property_attributes,
    #     bucket_name="retrofit-data-dev",
    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
    # )

    # from utils.s3 import read_pickle_from_s3
    # property_attributes = read_pickle_from_s3(
    #     bucket_name="retrofit-data-dev",
    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
    # )

    # We perform some additional cleaning on the data
    import msgpack
    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )

    cleaned = msgpack.unpackb(cleaned, raw=False)
    from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
    from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
    from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
    from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
    from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
    from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
    from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
    from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
    from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes

    cleaners = {
        "floor-description": FloorAttributes,
        'hotwater-description': HotWaterAttributes,
        'main-fuel': MainFuelAttributes,
        'mainheat-description': MainHeatAttributes,
        'mainheatcont-description': MainheatControlAttributes,
        'roof-description': RoofAttributes,
        'walls-description': WallAttributes,
        'windows-description': WindowAttributes,
        'lighting-description': LightingAttributes
    }

    for variable_to_clean in cleaned.keys():

        unique_descriptions = property_attributes[variable_to_clean].unique()
        clean_df = pd.DataFrame(cleaned[variable_to_clean])
        # Check if we have any
        missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
        if missed:
            descriptions_to_append = []
            for description in missed:
                if variable_to_clean == "lighting-description":
                    cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
                else:
                    cln = cleaners[variable_to_clean](description)
                to_append = {
                    "original_description": description,
                    "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
                    **cln.process()
                }
                descriptions_to_append.append(to_append)

            descriptions_to_append = pd.DataFrame(descriptions_to_append)
            clean_df = pd.concat([clean_df, descriptions_to_append])

        clean_df = clean_df.rename(
            columns={
                "thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
                "is_assumed": f"{variable_to_clean}_is_assumed",
            }
        )

        if 'thermal_transmittance_unit' in clean_df.columns:
            clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])

        starting_size = len(property_attributes)
        property_attributes = property_attributes.merge(
            clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
        )
        if starting_size != property_attributes.shape[0]:
            raise Exception("something went wrong")
        property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
        # Fill missings
        for k in clean_df.columns:
            if k in property_attributes.columns:
                property_attributes[k] = property_attributes[k].fillna("missing")

    # We group some variables such as thermal transmittance for walls, roof, floors
    # ranges = {
    #     "< 0.1": (0, 0.1),
    #     "0.1 - 0.3": (0.1, 0.3),
    #     "0.3 - 0.5": (0.3, 0.5),
    #     "0.5 - 0.7": (0.5, 0.7),
    #     "0.9 - 1": (0.9, 1),
    #     "1 - 1.5": (1, 1.5),
    #     "1.5 - 2": (1.5, 2),
    #     "2+": (2, 2.5)
    # }

    ranges = {
        "< 0.1": (0, 0.1),
        "0.1 - 0.3": (0.1, 0.3),
        "0.3 - 0.5": (0.3, 0.5),
        "0.5+": (0.5, 2.5),
    }

    # Generate the lookup table
    thermal_transmittance_lookup_table = []
    for i in range(1, 251):
        value = i / 100
        for label, (low, high) in ranges.items():
            if low < value <= high:
                thermal_transmittance_lookup_table.append({"from": value, "to": label})
                break

    # Convert to DataFrame for display
    thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
    thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)

    thermal_transmittance_cols = [
        c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
    ]
    for i, col in enumerate(thermal_transmittance_cols):
        # Perform the mapping
        to_col = f"to_{col}"
        property_attributes[col] = property_attributes[col].astype(str)
        property_attributes = property_attributes.merge(
            thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
            how="left",
            left_on=col,
            right_on="from",
            suffixes=("", f"_{i}")
        )
        property_attributes = property_attributes.drop(columns=["from", col])
        property_attributes[to_col] = property_attributes[to_col].fillna("unknown")

    # Drop the description columns that are the keys in cleaned
    print("PUT ME BACK!!??")
    property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
    # Perform the mapping

    # CLUSTERING!!
    grouping_columns = [
        'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
    ]

    additional_features = [

    ]

    # Define the preprocessing for numerical and categorical features
    numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]

    for col in categorical_features:
        property_attributes[col] = property_attributes[col].astype(str)

    id_column = 'internal_id'
    n_clusters = 450
    random_state = 0

    training_data_grouped = property_attributes.groupby(grouping_columns)
    group_sizes = {name: len(group) for name, group in training_data_grouped}
    total_size = sum(group_sizes.values())
    cluster_allocation = {
        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
    }

    # Adjust cluster allocation to ensure total clusters sum to 450
    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)

    # TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
    #       collect the results of the clustering and then perform the transformations afterwards

    final_clusters = []
    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):

        group_n_clusters = cluster_allocation[group_variables]
        group_data.set_index(id_column, inplace=True)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(), categorical_features)
            ]
        )

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])

        # Fit the pipeline to the data
        pipeline.fit(group_data)

        # Transform the data using the fitted pipeline
        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)

        # Get cluster labels
        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_

        # Get centroids (already in the same transformed space)
        centroids = pipeline.named_steps['kmeans'].cluster_centers_

        # if the data isn't an array, make it one
        if not isinstance(processed_data, np.ndarray):
            processed_data = processed_data.toarray()

        # Calculate distances from each point to the centroid of its cluster
        distances_to_centroids = [
            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
            for i, label in enumerate(group_data['cluster'])
        ]

        group_data['distance_to_centroid'] = distances_to_centroids

        # for cluster_id in group_data['cluster'].unique():
        #     cluster_data = group_data[group_data['cluster'] == cluster_id]
        #     min_distance = cluster_data['distance_to_centroid'].min()
        #     print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
        #     if min_distance != 0:
        #         print(f"No point with zero distance found in cluster {cluster_id}")

        # Ranking rows by distance within each cluster
        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')

        # Sorting to verify
        group_data.sort_values(by=['cluster', 'rank'], inplace=True)
        group_data.reset_index(inplace=True)

        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
        final_clusters.append(to_append)

    final_clusters = pd.concat(final_clusters)
    # remap the clusters from the current names to 1 -> n_clusters

    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
    final_clusters["cluster"] = final_clusters["cluster"].astype(str)

    ################################################
    # Prepare outputs!!!!
    ################################################

    property_attributes.reset_index(inplace=True)
    property_attributes = property_attributes.merge(
        final_clusters, how="left", on="internal_id"
    )
    property_attributes["archetype_representative"] = property_attributes["rank"] == 1

    asset_list_with_archetypes = asset_list.merge(
        property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
        on="internal_id"
    )

    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")

    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")

    asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
        "archetype_representative"].fillna(False)

    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)

    stonewater_uprn_lookup = asset_list_with_archetypes[
        ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
    ]

    stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")


def pull_ideal_postcodes(missing_uprn_with_udprn):
    """
    Look up every UDPRN in `missing_uprn_with_udprn` against the Ideal Postcodes UDPRN endpoint.

    One GET request is made per row, with a 0.5 second pause before each call.

    :param missing_uprn_with_udprn: DataFrame with a "udprn" column. Rows whose index label is lower than the
        resume marker `completed_id` are skipped, so the index is expected to be numeric.
    :return: list holding the "result" entry of each API response, in the order the rows were processed
    :raises ValueError: as soon as a call returns a status code other than 200
    """
    import requests
    import time

    api_key = ""  # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/

    # Resume marker: set this to the index to restart from after a failed run. It is incremented after each
    # successful call.
    completed_id = 0

    # These do not depend on the row, so build them once
    payload = {
        "api_key": api_key
    }
    headers = {
        'Accept': 'application/json'
    }

    uprn_to_udprn = []
    for row_index, data in tqdm(missing_uprn_with_udprn.iterrows(), total=len(missing_uprn_with_udprn)):
        if row_index < completed_id:
            continue
        time.sleep(0.5)

        # Call the API
        udprn = data["udprn"]
        url = f"https://api.ideal-postcodes.co.uk/v1/udprn/{udprn}?api_key={api_key}"

        response = requests.request("GET", url, headers=headers, data=payload)
        if response.status_code != 200:
            raise ValueError("API call dead")

        uprn_to_udprn.append(response.json()["result"])
        completed_id += 1

    # Hand the collected records back to the caller; without this the results of every call are lost
    return uprn_to_udprn


def updated_version():
    """
    Build the Stonewater archetypes using Stonewater's master sheet, the EPC data and the spatial data.

    This version of the clustering factors in the updates received from Stonewater to simplify the archetyping
    process using fewer variables and also factoring in their internal data sources

    This work began on the 23rd July 2024

    Steps:
      1. Read the asset list, UPRN lookups, Osmosis wave 2.1 properties, master sheet, EPC data and spatial data
      2. Keep the properties in a priority postcode that have a UPRN, are not in wave 2.1 and are not already
         EPC C or above
      3. Build the clustering features and merge very small groups into their nearest larger group
      4. Run KMeans inside each group and rank every property by its distance to its cluster centroid
      5. Write the asset list with archetypes and the archetyping features to CSV

    :return: None, the outputs are written to CSV files
    """

    ########################################################################
    # Read in data
    ########################################################################
    asset_list = read_asset_list()
    asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)

    # Read in the properties that have been included in Osmosis' wave 2.1
    osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()

    # A property is in wave 2.1 if either its asset id or its address (address1) appears in the wave 2.1 data
    asset_list["In Osmosis Wave 2.1"] = (
        asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids) |
        asset_list["address1"].isin(osmosis_wave_2_1["Name"])
    )

    priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()

    # Pull in the EPC data
    epc_data = read_epc_data(uprn_lookup_2)

    # Pull in the spatial data to UPRN
    # NOTE(review): this un-pickles an object from S3, which is only safe while the bucket contents are trusted
    spatial_data_to_uprn = read_pickle_from_s3(
        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
        bucket_name="retrofit-data-dev"
    )

    # Function to convert specific columns to bool dtype
    def convert_specific_columns_to_bool(df, columns):
        # NOTE(review): astype(bool) turns NaN and any non-empty string into True - confirm these columns never
        # hold missing values or strings
        for column in columns:
            if column in df.columns:
                df[column] = df[column].astype(bool)
        return df

    spatial_data_to_uprn = [convert_specific_columns_to_bool(
        df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
    ) for df in spatial_data_to_uprn]

    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
    spatial_data_to_uprn = spatial_data_to_uprn.drop(
        columns=["partition", "filename"]
    ).rename(columns={"UPRN": "uprn"})
    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)

    ########################################################################
    # Prepare the data
    ########################################################################

    # Flag the properties in the priority postcodes
    asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)

    # Copy so that the new column below is written to an independent frame rather than a slice
    master_sheet = master_sheet[
        master_sheet["Address ID"].isin(
            asset_list["external_address_id"].values
        )
    ].copy()

    # Use a single timestamp so both "days since" columns are measured from the same moment
    now = datetime.now()

    master_sheet["days_since_lodgement"] = (
        now - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
    ).dt.days

    asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
        master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
        how="left",
        left_on="external_address_id",
        right_on="Address ID"
    )

    asset_list = asset_list.merge(
        epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
        how="left",
        on="internal_id"
    )
    asset_list["days_since_lodgement_epc"] = (
        now - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
    ).dt.days

    # Flag properties that were surveyed within the last 5 years
    asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365

    # Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
    # a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
    # the EPC is done
    asset_list["is_epc_c_or_above"] = (
        ((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
        (asset_list["EPC Rating"] >= 80)
    )

    eligible_for_clustering = (
        asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
        ~pd.isnull(asset_list["uprn"])
    )
    clustering_features = asset_list[eligible_for_clustering][
        [
            "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
            "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
            "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
        ]
    ]

    # Merge on the SAP data
    clustering_features = clustering_features.merge(
        master_sheet[
            ["Address ID", "SAP"]
        ].rename(columns={"SAP": "parity_modelled_sap"}),
        how="left",
        left_on="external_address_id",
        right_on="Address ID"
    )

    # For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
    clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
    clustering_features["representative_sap"] = np.where(
        clustering_features["epc_within_5_years"],
        clustering_features["current-energy-efficiency"],
        clustering_features["parity_modelled_sap"]
    )

    # We remove the final three entries from postcode to give us postal region. Removing two gives us 415 values which
    # is too many
    clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]

    # Merge on spatial features
    clustering_features = clustering_features.merge(
        spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
        how="left",
        on="uprn"
    )

    # incorect_epcs = clustering_features[
    #     clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
    # incorect_epcs = incorect_epcs[
    #     ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
    #     ]
    # incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
    # # Store data
    # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)

    # We add in the key features, which are used for clustering
    master_sheet_clustering_features = master_sheet[
        ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
    ].copy()

    # Step 1: Remap walls - we end up with 11 types
    master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
        {
            "TimberFrame: AsBuilt": "Other wall type, as built",
            "SystemBuilt: AsBuilt": "Other wall type, as built",
            "Sandstone: AsBuilt": "Other wall type, as built",
            "Sandstone: Internal": "Other wall type, internal or external",
            "SystemBuilt: External": "Other wall type, internal or external",
            "GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
            "TimberFrame: Internal": "Other wall type, internal or external",
            "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
            "SystemBuilt: Internal": "Other wall type, internal or external",
            "Cavity: Internal": "Other wall type, internal or external",
        }
    )

    # Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
    #         gives us the insulation thickness

    # Clean an incorrect value
    master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
        {
            "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
            "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
            'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
            'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
            'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
        }
    )

    # partition splits on the first ":" only and gives an empty thickness when there is no ":" at all.
    # Columns of the result are 0 (before the separator), 1 (the separator) and 2 (after the separator).
    roof_parts = master_sheet_clustering_features['Roofs'].str.partition(':')

    # Strip any extra whitespace
    master_sheet_clustering_features['roof_type'] = roof_parts[0].str.strip()
    master_sheet_clustering_features['roof_insulation_thickness'] = roof_parts[2].str.strip()

    def map_thickness(thickness):
        # Note that exactly 250mm is labelled "Below 250mm"
        try:
            value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
            return "Above 250mm" if value > 250 else "Below 250mm"
        except ValueError:
            return thickness  # Return the original value if it cannot be converted to a float

    master_sheet_clustering_features['roof_insulation_category'] = (
        master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
    )

    # Ideas
    # 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
    # as a secondary category
    # 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
    # (could split on :, take first part)

    clustering_features = clustering_features.merge(
        master_sheet_clustering_features,
        how="left",
        on="Address ID"
    )

    # Reduce down to the final set of features we need. Copy so the column assignments below do not write
    # into a slice of the wider frame
    clustering_features = clustering_features[
        [
            "internal_id",
            "Property Type",
            # Location
            "postal_region",
            'conservation_status',
            'is_listed_building',
            'is_heritage_building',
            "county",
            # Walls
            "walls_reduced",
            # Roof
            "roof_type",
            "roof_insulation_category",
            # Heating
            "Heating",
            # Fuel
            "Main Fuel",
            "Age",
            "Total Floor Area",
            "representative_sap",
            "days_since_lodgement",
        ]
    ].copy()

    # Properties with no lodgement date are given a very large age
    clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)

    def split_property_type(row):
        # "a: b: c" -> property type, built form, extended feature. Missing parts become empty strings
        parts = row.split(':')
        property_type = parts[0].strip()
        built_form = parts[1].strip() if len(parts) > 1 else ''
        property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
        return pd.Series([property_type, built_form, property_extended_feature])

    clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
        clustering_features['Property Type'].apply(split_property_type)
    )
    clustering_features = clustering_features.drop(columns=["Property Type"])

    # These are the variables we MUST split by
    grouping_columns = [
        "property_type",
        "walls_reduced",
        "roof_type",
        "Main Fuel",
        "county",
    ]

    def combine_small_groups(clustering_features, grouping_columns, threshold=2):
        """
        Move every group with `threshold` rows or fewer into the group of its nearest row from a larger group.

        Distances are measured on the one-hot encoded categorical columns stacked with the raw numerical columns.
        NOTE(review): the numerical columns are not scaled here, so large-valued columns weigh most in the distance.

        :param clustering_features: DataFrame of features, including "internal_id" and the grouping columns
        :param grouping_columns: list of the columns that define a group
        :param threshold: groups of this size or smaller are merged into another group
        :return: DataFrame with the same rows, where the grouping columns of the small groups have been overwritten
        """
        # Identify small groups
        group_sizes = clustering_features.groupby(grouping_columns).size()
        small_groups = group_sizes[group_sizes <= threshold].index.tolist()
        if not small_groups:
            return clustering_features

        # Split the rows into those in a small group and the rest
        in_small_group = clustering_features.set_index(grouping_columns).index.isin(small_groups)
        small_group_data = clustering_features[in_small_group].copy()
        clustering_features_ok = clustering_features[~in_small_group]

        # Nothing to move, or nowhere to move it to
        if small_group_data.empty or clustering_features_ok.empty:
            return clustering_features

        # One-Hot Encode categorical variables
        categorical_features = (
            clustering_features_ok.drop(columns=["internal_id"])
            .select_dtypes(include=['object', 'category']).columns.tolist()
        )
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ohe.fit(clustering_features_ok[categorical_features])

        small_group_ohe = ohe.transform(small_group_data[categorical_features])
        large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])

        # The id is an identifier, not a feature, so it must never take part in the distance
        numerical_features = [
            c for c in clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns
            if c != "internal_id"
        ]
        small_group_numerical = small_group_data[numerical_features].values
        large_group_numerical = clustering_features_ok[numerical_features].values

        # Concatenate one-hot encoded categorical and numerical features
        small_group_features = np.hstack([small_group_ohe, small_group_numerical])
        large_group_features = np.hstack([large_group_ohe, large_group_numerical])

        # For every ROW of the small groups: the position of, and the distance to, its nearest large-group row
        nearest_rows, nearest_distances = pairwise_distances_argmin_min(small_group_features, large_group_features)

        # The neighbours are per row, not per group, so they cannot be paired with the list of small groups
        # directly. Each small group instead follows the member that sits closest to a large-group row, which
        # keeps the rows of a small group together.
        group_numbers = small_group_data.groupby(grouping_columns).ngroup().to_numpy()
        reassigned_keys = small_group_data[grouping_columns].to_numpy(dtype=object, copy=True)
        large_group_keys = clustering_features_ok[grouping_columns].to_numpy(dtype=object)
        for group_number in np.unique(group_numbers):
            members = np.flatnonzero(group_numbers == group_number)
            closest_member = members[np.argmin(nearest_distances[members])]
            reassigned_keys[members] = large_group_keys[nearest_rows[closest_member]]
        small_group_data[grouping_columns] = reassigned_keys

        combined_data = pd.concat([clustering_features_ok, small_group_data])
        return combined_data

    clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)

    ########################################################################
    # Clustering
    ########################################################################
    id_column = 'internal_id'
    n_clusters = 450
    random_state = 0

    # NOTE(review): only int64/float64 and object/category columns are used. A column with a plain bool dtype
    # would be left out of the clustering - confirm the dtypes of the spatial flags
    numerical_features = [
        c for c in clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns
        if c != id_column
    ]
    text_columns = [
        c for c in clustering_features_combined.select_dtypes(include=['object', 'category']).columns
        if c != id_column
    ]

    # Cast to string (the grouping columns included) so that missing values become their own category and
    # are not dropped by the groupby
    for col in text_columns:
        clustering_features_combined[col] = clustering_features_combined[col].astype(str)

    # The grouping columns are constant inside a group, so they add no information to the clustering
    categorical_features = [c for c in text_columns if c not in grouping_columns]

    training_data_grouped = clustering_features_combined.groupby(grouping_columns)
    group_sizes = {name: len(group) for name, group in training_data_grouped}
    total_size = sum(group_sizes.values())
    # Share the clusters out in proportion to group size, with at least one cluster per group
    cluster_allocation = {
        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
    }

    # Adjust cluster allocation to ensure total clusters sum to 450
    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)

    final_clusters = []
    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):

        # KMeans cannot fit more clusters than there are rows
        group_n_clusters = min(cluster_allocation[group_variables], len(group_data))
        # Work on a new frame rather than modifying the group handed out by the groupby
        group_data = group_data.set_index(id_column)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(), categorical_features)
            ]
        )

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])

        # Fit the pipeline to the data
        pipeline.fit(group_data)

        # Transform the data using the fitted pipeline
        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)

        # if the data isn't an array, make it one
        if not isinstance(processed_data, np.ndarray):
            processed_data = processed_data.toarray()

        # Get cluster labels and centroids (the centroids are already in the same transformed space)
        labels = pipeline.named_steps['kmeans'].labels_
        centroids = pipeline.named_steps['kmeans'].cluster_centers_

        # Distance from each point to the centroid of its own cluster: compute all point-to-centroid distances
        # in one call and pick out the column of the assigned cluster for every row
        group_data['cluster'] = labels
        group_data['distance_to_centroid'] = cdist(processed_data, centroids)[np.arange(len(labels)), labels]

        # Ranking rows by distance within each cluster, rank 1 is the closest to the centroid
        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')

        # Sorting to verify
        group_data = group_data.sort_values(by=['cluster', 'rank']).reset_index()

        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
        # Cluster labels restart at 0 in every group, so add the group key to make them unique overall
        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
        final_clusters.append(to_append)

    final_clusters = pd.concat(final_clusters)
    # remap the clusters from the current names to consecutive ids, starting at 0
    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
    final_clusters["cluster"] = final_clusters["cluster"].astype(str)

    assigned_clusters = clustering_features_combined.merge(
        final_clusters, how="left", on="internal_id"
    )

    # The representative of an archetype is the property closest to the cluster centroid
    assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1

    asset_list_with_archetypes = asset_list.merge(
        assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
        on="internal_id"
    ).merge(
        master_sheet_clustering_features[["Address ID", "Property Type", "Walls", "Roofs", "Heating"]],
        how="left",
        on="Address ID"
    )

    # We populate the reasons for no archetype. The reasons are applied in order, so when several apply to the
    # same property the last one in this list is the one that is kept
    no_archetype_reasons = [
        # 1) If it's not a priority postcode
        (~asset_list_with_archetypes["is_priority_postcode"], "NOT PRIORITY POSTCODE"),
        # 2) If it's EPC C or above
        (asset_list_with_archetypes["is_epc_c_or_above"], "EPC C OR ABOVE"),
        # If it's in Wave 2.1
        (asset_list_with_archetypes["In Osmosis Wave 2.1"], "IN WAVE 2.1"),
        # Has missing uprn
        (pd.isnull(asset_list_with_archetypes["uprn"]), "MISSING UPRN"),
    ]
    for applies, reason in no_archetype_reasons:
        asset_list_with_archetypes["cluster"] = np.where(
            applies,
            reason,
            asset_list_with_archetypes["cluster"]
        )

    # -999 is a temporary placeholder so the column can be cast to int before it is turned into text
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")

    asset_list_with_archetypes["archetype_representative"] = (
        asset_list_with_archetypes["archetype_representative"].fillna(False)
    )

    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.1.csv", index=False)

    # Produce the archetyping features
    archetyping_features_csv = assigned_clusters[
        [
            "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
            "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
        ]
    ].merge(
        asset_list[
            ["internal_id", "uprn", "external_address_id"]
        ],
        how="left",
        on="internal_id"
    ).merge(
        master_sheet_clustering_features,
        how="left",
        right_on="Address ID",
        left_on="external_address_id"
    ).drop(columns=["Address ID"]).rename(
        columns={
            "internal_id": "Osm. ID",
            "external_address_id": "Address ID",
        }
    )

    # "cluster" holds text at this point, so this ordering is alphabetical rather than numerical
    archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
    archetyping_features_csv.to_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
    )

    # Quick check on how widely the representatives are spread geographically
    representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
    print(representatives["postal_region"].nunique())
    print(representatives["county"].nunique())


def read_asset_list(
    asset_list_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx"
    ),
    udprn_path="/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx",
):
    """
    Load the asset list workbook, attach a UDPRN to every asset and build a single-line address.

    The two workbook locations used to be hard-coded inside the body; they are now optional arguments whose
    defaults are the original paths, so existing zero-argument calls behave exactly as before.

    :param asset_list_path: Excel workbook holding the asset list. Column names are read from the fifth row
        (``header=4``).
    :param udprn_path: Excel workbook with at least the columns ``AddressId`` and ``UDPRN``.
    :return: DataFrame with the asset list columns renamed to snake_case, a string ``udprn`` column and a
        ``full_address`` column. Assets with no row in the UDPRN workbook are dropped (inner join).
    """
    asset_list = pd.read_excel(asset_list_path, header=4)

    udprn_data = pd.read_excel(udprn_path, header=0)[["AddressId", "UDPRN"]].rename(
        columns={"AddressId": "Address ID"}
    )
    # Go through the nullable integer dtype first so whole-number UDPRNs are rendered as "123", not "123.0"
    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)

    # NOTE(review): only the UDPRN side of the join key is cast to str here; this assumes "Address ID" in the
    # asset list workbook is already read as text - confirm against the source file.
    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")

    asset_list = asset_list.rename(
        columns={
            "UDPRN": "udprn",
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
            "Owning body": "owner",
        }
    )

    # Both address variants end with "<Town>, <postcode>"; address2 is only included when it is present
    town_and_postcode = asset_list["city_town"].str.title() + ", " + asset_list["postcode"]
    asset_list["full_address"] = np.where(
        asset_list["address2"].notna(),
        asset_list["address1"] + ", " + asset_list["address2"] + ", " + town_and_postcode,
        asset_list["address1"] + ", " + town_and_postcode,
    )
    return asset_list


def merge_uprn_to_asset_list(asset_list):
    """
    Attach UPRNs and standardised addresses to the asset list using four lookup sources.

    Sources, with the ``match_type`` label each one is given:
      1. ``address_uprn_udprn_lookup.json`` (S3)                      -> "Exact"
      2. ``address_uprn_udprn_lookup_2.json`` (S3)                    -> "EPC"
      3. ``ideal-postcodes_pull_2.json`` (S3), joined on ``udprn``    -> "Exact"
      4. ``manual_fix_uprns-populated.csv`` (current working dir)     -> "Fuzzy"

    :param asset_list: asset DataFrame; the code reads its ``udprn``, ``internal_id`` and
        ``external_address_id`` columns.
    :return: tuple ``(asset_list inner-joined to the combined lookup, lookup 2 after its rename/override)``
    :raises AssertionError: if the lookups together do not contain exactly one row per asset
    """

    def _json_frame_from_s3(s3_key):
        # Every S3 lookup is a JSON document in the same bucket that loads straight into a DataFrame
        return pd.DataFrame(json.loads(read_from_s3(bucket_name="retrofit-data-dev", s3_file_name=s3_key)))

    # NOTE(review): lookups 1 and 2 are read from the "scustomers/" prefix whereas lookup 3 uses "customers/" -
    # confirm the leading "s" is the real key and not a typo.
    exact_lookup = _json_frame_from_s3("scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json")
    exact_lookup["match_type"] = "Exact"

    epc_lookup = _json_frame_from_s3(
        "scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
    ).rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )
    epc_lookup["match_type"] = "EPC"
    # Hard-coded UPRN override for internal_id 1091 (the same override is applied to the EPC data elsewhere)
    is_overridden = epc_lookup["internal_id"] == 1091
    epc_lookup["uprn"] = np.where(is_overridden, 83143766, epc_lookup["uprn"])

    ideal_postcodes = _json_frame_from_s3("customers/Stonewater/clustering/ideal-postcodes_pull_2.json")
    address_parts = ideal_postcodes[["line_1", "line_2", "line_3", "district", "postcode"]]
    ideal_postcodes["standardised_address"] = address_parts.apply(concatenate_row, axis=1)
    ideal_postcodes = ideal_postcodes[
        ["udprn", "uprn", "standardised_address", "postcode"]
    ].rename(columns={"postcode": "standardised_postcode"})
    ideal_postcodes["match_type"] = "Exact"

    # Manually reviewed matches: each row says which candidate (OS option 1, OS option 2, or a manual entry)
    # was picked
    manual_fixes = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
    manual_fixes["os_option_1_uprn"] = manual_fixes["os_option_1_uprn"].astype(str)
    manual_fixes["os_option_2_uprn"] = manual_fixes["os_option_2_uprn"].astype("Int64").astype(str)

    manual_rows = []
    for _, fix in manual_fixes.iterrows():
        choice = fix["option"]
        if choice == 1:
            chosen_uprn, chosen_address, chosen_postcode = (
                fix["os_option_1_uprn"], fix["os_option_1_address"], fix["os_option_1_postcode"]
            )
        elif choice == 2:
            chosen_uprn = fix["os_option_2_uprn"]
            chosen_address = fix["os_option_2_address"]
            # Option 2 takes its postcode from the last comma-separated part of the address
            chosen_postcode = chosen_address.split(", ")[-1]
        else:
            # Any other value of "option" falls back to the manually entered details
            chosen_uprn, chosen_address, chosen_postcode = (
                fix["manual_uprn"], fix["manual_address"], fix["manual_postcode"]
            )

        manual_rows.append(
            {
                "internal_id": fix["internal_id"],
                "external_address_id": fix["external_address_id"],
                "uprn": chosen_uprn,
                "standardised_address": chosen_address,
                "standardised_postcode": chosen_postcode,
                # The manual file carries no property attributes, so these are always empty
                "property_type": None,
                "built_form": None
            }
        )
    manual_lookup = pd.DataFrame(manual_rows)
    manual_lookup["match_type"] = "Fuzzy"

    # Lookups 1 and 2 already carry the asset ids, so they can be stacked immediately
    s3_id_lookups = pd.concat([exact_lookup, epc_lookup])

    # Sanity check: the four lookups together must account for every asset exactly once
    assert len(s3_id_lookups) + len(ideal_postcodes) + len(manual_lookup) == len(asset_list)

    # Lookup 3 is keyed on udprn only; pull the asset ids in from the asset list before stacking it
    ideal_postcodes["udprn"] = ideal_postcodes["udprn"].astype(str)
    asset_keys = asset_list[["udprn", "internal_id", "external_address_id"]]
    ideal_postcodes = ideal_postcodes.merge(asset_keys, how="left", on="udprn")

    combined_lookup = pd.concat([
        s3_id_lookups,
        ideal_postcodes,
        manual_lookup
    ])
    combined_lookup["external_address_id"] = combined_lookup["external_address_id"].astype(str)

    # udprn is dropped from the lookup so the asset list's own udprn column is not duplicated by the merge
    merged_assets = asset_list.merge(
        combined_lookup.drop(columns=["udprn"]),
        how="inner",
        on=["internal_id", "external_address_id"]
    )

    return merged_assets, epc_lookup


def read_omosis_wave_2_1(
    path="/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
):
    """
    Load the Osmosis SHDF wave 2.1 workbook and extract the asset ids it covers.

    Rows whose "Location" (the workbook's second, unnamed column) is "Removed from program" are discarded.

    :param path: location of the workbook; defaults to the path that was previously hard-coded, so existing
        zero-argument calls are unaffected. Column names are read from the fifth row (``header=4``).
    :return: tuple ``(asset_ids, sheet)`` where ``asset_ids`` is a flat list of ints and ``sheet`` is the
        filtered DataFrame.
    """
    osmosis_wave_2_1 = pd.read_excel(path, header=4)

    # Replace double spaces in "Name" with a single space (literal match, single pass)
    osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace("  ", " ", regex=False)

    osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
    osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]

    # Build a cleaned, flat list of asset ids. A cell can be empty, hold one id, or hold several ids in the
    # form 'id1, id2'. Excel may hand single ids back as numbers rather than text, so only text cells are
    # split; empty fragments (e.g. from a trailing comma) are ignored instead of crashing int().
    osmosis_wave_2_1_asset_ids = []
    for raw_id in osmosis_wave_2_1["Asset ID"].dropna():
        if isinstance(raw_id, str):
            osmosis_wave_2_1_asset_ids.extend(
                int(fragment.strip()) for fragment in raw_id.split(",") if fragment.strip()
            )
        else:
            osmosis_wave_2_1_asset_ids.append(int(raw_id))

    return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1


def read_stonewater_asset_data(
    master_sheet_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
        "sheet.csv"
    ),
    priority_postcodes_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
        "postcodes.csv"
    ),
):
    """
    Load the master sheet and the priority postcodes, and list the address ids already in a previous wave.

    An address counts as "previous wave" when any of the columns "In Osmosis W2.1", "In Wates Wave 2.1",
    "In Liv Green Wave 2.1" or "In CCS Wave 2.1" equals "Yes".

    :param master_sheet_path: latin-1 encoded CSV of the master sheet. Defaults to the previously hard-coded
        path, so existing zero-argument calls are unaffected.
    :param priority_postcodes_path: CSV of priority postcodes whose column names are on row index 17
        (``header=17``). Defaults to the previously hard-coded path.
    :return: tuple ``(priority_postcodes, previous_waves_address_id, master_sheet)`` - a list of postcodes,
        a list of address id strings, and the master sheet with "Address ID" cast to str.
    """
    master_sheet = pd.read_csv(master_sheet_path, encoding='latin1')

    # Remember which rows really have an Address ID *before* the string cast. Checking for nulls after the
    # cast does not work: a missing id has already been turned into a string by then and would be reported
    # as a genuine previous-wave address id.
    has_address_id = master_sheet["Address ID"].notna()
    master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)

    in_previous_wave = (
        (master_sheet["In Osmosis W2.1"] == "Yes") |
        (master_sheet["In Wates Wave 2.1"] == "Yes") |
        (master_sheet["In Liv Green Wave 2.1"] == "Yes") |
        (master_sheet["In CCS Wave 2.1"] == "Yes")
    )
    previous_waves_address_id = master_sheet.loc[in_previous_wave & has_address_id, "Address ID"].tolist()

    # We also read the priority postcodes
    priority_postcodes = pd.read_csv(priority_postcodes_path, header=17)["Postcode"].tolist()

    return priority_postcodes, previous_waves_address_id, master_sheet


def read_epc_data(uprn_lookup_2):
    """
    Load both batches of EPC data from S3 and return them stacked in one DataFrame.

    The first (JSON) batch gets the hard-coded UPRN override for internal_id 1091 and is then restricted to
    the internal ids present in ``uprn_lookup_2``; the second (pickle) batch is appended unfiltered.

    :param uprn_lookup_2: DataFrame with an ``internal_id`` column naming the first-batch EPCs to keep
    :return: DataFrame of the first batch (filtered) followed by the second batch
    """
    bucket = "retrofit-data-dev"

    first_batch = pd.DataFrame(
        json.loads(read_from_s3(bucket_name=bucket, s3_file_name="customers/Stonewater/clustering/epc_data.json"))
    )

    # Same manual UPRN correction that is applied to the EPC-based UPRN lookup
    needs_override = first_batch["internal_id"] == 1091
    first_batch["uprn"] = np.where(needs_override, 83143766, first_batch["uprn"])

    # We drop some EPCs: only those whose internal_id appears in the lookup are kept
    wanted_ids = uprn_lookup_2["internal_id"].values
    first_batch = first_batch[first_batch["internal_id"].isin(wanted_ids)]

    second_batch = pd.DataFrame(
        read_pickle_from_s3(
            s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
            bucket_name=bucket
        )
    )

    return pd.concat([first_batch, second_batch])