Model/etl/customers/thrive/Programme Analysis.py

"""
The Thrive programme has not been completed to specification. This script re-builds the programme and attempts to
address the following concerns:
- Which properties have been surveyed
- Of the properties that have been surveyed, what has been installed
- Which properties have been visited

"""

import pandas as pd
from tqdm import tqdm
from backend.SearchEpc import SearchEpc

# This is Thrive's list of properties and when they should have been surveyed
thrive_tracker = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List - "
    "Standardised.xlsx",
    sheet_name="Tracker",
    header=2
)

original_asset_list = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List.xlsx",
    header=0
)

# Find properties that are on the thrive tracker that
missed_properties = thrive_tracker[
    ~thrive_tracker["UPRN"].astype(str).isin(original_asset_list["Client ref 1"].astype(str).values)
].copy()

# We produce the complete list, with all of the columns we need, for data standardisation

original_columns = {
    "Client ref 1": "thrive_property_id",
    "Address": "full_address",
    "Address Line 1": "address_line_1",
    "Address Line 2": "address_line_2",
    "Address Line 3": "address_line_3",
    "Address Line 4": "address_line_4",
    "County": "county",
    "Postcode": "postcode",
    "Block Name": "block_reference",
    "Construction Year": "construction_year",
    "Age band (calculated)": "age_band_calculated",
    "Property type": "property_type",
    "Client property type 1": "detailed_property_type",
    "Client property type 2": "detailed_property_type_2",
    "bed count": "number_of_bedrooms",
    "Heating Type": "heating_type",
    "WFT Findings": "WFT Findings",
    "ECO Eligibility": "ECO Eligibility",
}

original_asset_list = original_asset_list[original_columns.keys()].rename(columns=original_columns)
original_asset_list["Data Source"] = "Original Asset List"
original_asset_list = original_asset_list.drop_duplicates()

# We append on the missed properties, with the information we have
missed_properties["Full Address"] = (
    missed_properties["#"].astype(str) + ", " +
    missed_properties["Adress Line 1"].astype(str) + ", " +
    missed_properties["Postcode"].astype(str)
)
missed_columns = {
    "UPRN": "thrive_property_id",
    "Full Address": "full_address",
    "Short Address": "address_line_1",
    "Postcode": "postcode",
    "Property Type": "property_type",
    "Build Form": "build_form",
    "Build year": "age_band_calculated",
    "Assumed mm ": "assumed_loft_insulation_thickness",
    "SAP": "sap_rating",
}

missed_properties = missed_properties[missed_columns.keys()].rename(columns=missed_columns)
missed_properties["WFT Findings"] = "Property Not Inspected"
missed_properties["ECO Eligibility"] = "Property Not Inspected"
missed_properties["Data Source"] = "Thrive Tracker"

# We de-dupe ides in original_asset_list
dupe_ids = original_asset_list[original_asset_list["thrive_property_id"].duplicated()]["thrive_property_id"].unique()
dupes = original_asset_list[
    original_asset_list["thrive_property_id"].isin(dupe_ids)
].copy()
dupes = dupes.sort_values("thrive_property_id")

original_asset_list = original_asset_list.rename(
    columns={
        "detailed_property_type": "build_form"
    }
)

master_list = pd.concat([missed_properties, original_asset_list], ignore_index=True)

# We were provided with a data update for a sample of properties. We update the data with this information
data_update = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Property List "
    "13_05.xlsx",
    header=0
)

new_properties = data_update[~data_update["UPRN"].isin(master_list["thrive_property_id"].astype(str).values)].copy()
new_properties["Full Address"] = (
    new_properties["#"].astype(str) + ", " +
    new_properties["Adress Line 1"].astype(str) + ", " +
    new_properties["Postcode"].astype(str)
)
new_properties = new_properties[missed_columns.keys()].rename(columns=missed_columns)
new_properties["WFT Findings"] = "Property Not Inspected"
new_properties["ECO Eligibility"] = "Property Not Inspected"
new_properties["Data Source"] = "13.05.2025 Data Update"

master_list = pd.concat([new_properties, master_list])

# We append any new data on heating system, heating type, and insulation type, based on the data update
master_list = master_list.merge(
    data_update[["UPRN", "Heating Type", "Assumed mm ", "SAP"]].rename(
        columns={
            "Heating Type": "heating_type_updated",
            "Assumed mm ": "assumed_loft_insulation_thickness_updated",
            "SAP": "sap_rating_updated"
        }
    ),
    how="left",
    left_on="thrive_property_id",
    right_on="UPRN"
)

# We fill the missings
master_list["heating_type_updated"] = master_list["heating_type_updated"].fillna(master_list["heating_type"])
master_list["assumed_loft_insulation_thickness_updated"] = master_list[
    "assumed_loft_insulation_thickness_updated"
].fillna(master_list["assumed_loft_insulation_thickness"])
master_list["sap_rating_updated"] = master_list["sap_rating_updated"].fillna(master_list["sap_rating"])

assert not master_list["thrive_property_id"].duplicated().sum(), "Duplicate thrive_property_id found in master_list"

master_list["Address in tracker"] = master_list["thrive_property_id"].astype(str).isin(
    thrive_tracker["UPRN"].astype(str).values
)

# Those the asset list - call it master asset list updated May2025
master_list = master_list.drop(columns=["UPRN"])
master_list["thrive_property_id"] = master_list["thrive_property_id"].astype(str)
# master_list.to_excel(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List - "
#     "Complete - Updated May 2025.xlsx",
# )

master_list["house_number_TEMP"] = master_list.apply(
    lambda x: SearchEpc.get_house_number(address=x["full_address"], postcode=x["postcode"]),
    axis=1
)

# We add in the status of the property
# TODO: Add the status of the property from the Thrive tracker
outcomes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive - Outcomes - April "
    "24-March25.xlsx",
    header=0
)
outcomes["row_id"] = outcomes.index

# We have two ids which have the same phohe. nymber, but different UPRN, so we don't match to the tracker for these
tracker_for_matching = thrive_tracker[
    ~thrive_tracker["UPRN"].isin(['OAKGRE0065ABBLDW1', 'OAKGRE0066ABBLDW1', 'JACKET0102ABBLDW1', 'BELLCL0008BEDMDW1'])
].copy()
tracker_for_matching["Full Address"] = (
    tracker_for_matching["#"].astype(str) + ", " +
    tracker_for_matching["Adress Line 1"].astype(str) + ", " +
    tracker_for_matching["Postcode"].astype(str)
)

outcomes_id_lookup = []
for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)):

    hn = str(x["No."])
    address = x["Address"]
    postcode = x["Postcode"]
    contact_no = str(x["Contact No"]) if not pd.isnull(x["Contact No"]) else str(x["Contact No.1"])
    contact_no = None if contact_no == "nan" else contact_no

    if address == "292 Micklefield Road":
        hn = "292"

    if (address == "Micklefield Road") & (hn == "302"):
        hn = "292"

    if (address == "103a Norfolk Road Rickmansworth Hertfordshire WD3 1JY"):
        hn = "103a"

    if (address == "105a Norfolk Road Rickmansworth Hertfordshire WD3 1JY"):
        hn = "105a"

    if (address == "107a Norfolk Road Rickmansworth Hertfordshire WD3 1JY"):
        hn = "107a"

    #
    # # We match this to the tracker
    # m1 = tracker_for_matching[tracker_for_matching["Primary Number"].astype(str) == contact_no]
    # # Many of the phone numbers don't have a leading zero in the tracker so we add them
    # if (m1.shape[0] != 1) and not pd.isnull(contact_no):
    #     m1 = tracker_for_matching[tracker_for_matching["Primary Number"].astype(str) == contact_no.lstrip("0")]
    #
    # if m1.shape[0] > 1:
    #     raise ValueError(
    #         f"Error for {hn} - {address} - {postcode} - {contact_no} in the tracker"
    #     )

    # if m1.empty:
    m1 = tracker_for_matching[
        (tracker_for_matching["#"].astype(str) == hn) &
        (tracker_for_matching["Postcode"] == postcode)
        ]

    if m1.empty:
        # Some properties aren't in the tracker, we match to the master list
        m1 = master_list[
            (master_list["house_number_TEMP"].astype(str) == hn) &
            (master_list["postcode"] == postcode)
            ]
        outcomes_id_lookup.append(
            {
                "row_id": x["row_id"],
                "thrive_property_id": m1["thrive_property_id"].values[0],
                "address": m1["full_address"].values[0],
                "postcode": m1["postcode"].values[0],
            }
        )
        continue

    if m1.shape[0] != 1:
        raise ValueError(
            f"Error for {hn} - {address} - {postcode} - {contact_no} in the tracker"
        )

    # We add the status to the master list
    outcomes_id_lookup.append(
        {
            "row_id": x["row_id"],
            "thrive_property_id": m1["UPRN"].values[0],
            "address": m1["Full Address"].values[0],
            "postcode": m1["Postcode"].values[0],
        }
    )

outcomes_id_lookup = pd.DataFrame(outcomes_id_lookup)
outcomes = outcomes.merge(
    outcomes_id_lookup,
    how="left",
    left_on="row_id",
    right_on="row_id"
)

outcomes = outcomes.drop(columns=["row_id"])
outcomes = outcomes.rename(
    columns={
        "Outcomes": "Outcome",
        "Notes                                                                                         (If 'no "
        "answer' under outcomes, have you checked around the property for access issues where possible?)": "Notes",
    }
)


# Store the corrected outcomes
# outcomes.to_excel(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive - Outcomes -
#     April 24-March25 - Corrected.xlsx",
#     index=False
# )


def parse_date(value):
    # Strip any 'W.C' or 'w/c' prefix and clean whitespace
    value = value.strip().lower().replace('w.c', '').replace('w/c', '').strip()
    try:
        # Try parsing the date with dayfirst=True
        return pd.to_datetime(value, dayfirst=True, errors='coerce')
    except Exception:
        return pd.NaT


outcomes['Parsed Date'] = outcomes['Date letters sent'].apply(parse_date)

# Next step - match the submissions master to the asset list. We will append on the UPRN
eco3_submissions = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO3.csv",
    header=0
)
eco3_submissions["row_id"] = eco3_submissions.index

eco4_submissions = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO4.csv",
    header=0
)
eco4_submissions["row_id"] = eco4_submissions.index

# List of properties never on the asset list
not_on_master = [
    "7+FOXGROVE PATH+WD19 6YL", "9+FOXGROVE PATH+WD19 6YL", "11+FOXGROVE PATH+WD19 6YL",
    "20+LINCOLN DRIVE+WD19 7BA", "22+LINCOLN DRIVE+WD19 7BA", "24+LINCOLN DRIVE+WD19 7BA",
    "26+LINCOLN DRIVE+WD19 7BA", "1+Ryman Court, Stag Lane+WD3 5HN", "6+Ryman Court, Stag Lane+WD3 5HN",
    "9+Ryman Court, Stag Lane+WD3 5HN", "10+Ryman Court, Stag Lane+WD3 5HN", "11+Ryman Court, Stag Lane+WD3 5HN",
    "12+Ryman Court, Stag Lane+WD3 5HN", "14+Ryman Court, Stag Lane+WD3 5HN", "15+Ryman Court, Stag Lane+WD3 5HN",
    "20+Ryman Court, Stag Lane+WD3 5HN", "21+Ryman Court, Stag Lane+WD3 5HN", "22+Ryman Court, Stag Lane+WD3 5HN",
    "25+Ryman Court, Stag Lane+WD3 5HN", "26+Ryman Court, Stag Lane+WD3 5HN", "31+Ryman Court, Stag Lane+WD3 5HN",
    "33+Ryman Court, Stag Lane+WD3 5HN", "34+Ryman Court, Stag Lane+WD3 5HN",
    '37+Ryman Court, Stag Lane+WD3 5HN', '38+Ryman Court, Stag Lane+WD3 5HN', '39+Ryman Court, Stag Lane+WD3 5HN',
    '41+Ryman Court, Stag Lane+WD3 5HN', '43+Ryman Court, Stag Lane+WD3 5HN', '45+Ryman Court, Stag Lane+WD3 5HN',
    '46+Ryman Court, Stag Lane+WD3 5HN', '48+Ryman Court, Stag Lane+WD3 5HN', '49+Ryman Court, Stag Lane+WD3 5HN',
    '50+Ryman Court, Stag Lane+WD3 5HN', '52+Ryman Court, Stag Lane+WD3 5HN'
]

eco3_remap = {
    "19+OAKHILL ROAD+WD5 8RE": ('19', 'OAKHILL ROAD', 'WD3 9RE'),
    "29+OAKHILL ROAD+WD5 8RE": ('29', 'OAKHILL ROAD', 'WD3 9RE'),
    "31+OAKHILL ROAD+WD5 8RE": ('31', 'OAKHILL ROAD', 'WD3 9RE'),
    "44+OAKHILL ROAD+WD5 8RE": ('44', 'OAKHILL ROAD', 'WD3 9RF'),
    "64+OAKHILL ROAD+WD4 8RF": ('64', 'OAKHILL ROAD', 'WD3 9RF'),
    "11+LANCASTER WAY+WD3 PRE": ('11', 'LANCASTER WAY', 'WD5 0PQ'),
    "16+LANCASTER WAY+WD3 PRE": ('16', 'LANCASTER WAY', 'WD5 0PQ'),
    "58+TALBOT ROAD +WD31HE": ('58', 'TALBOT ROAD', 'WD3 1HE'),
    "10+PEARTREE COURT/WELWYN GARDEN CITY+AL73XN": ('10', 'PEARTREE COURT/WELWYN GARDEN CITY', 'AL7 3XN'),
    "25+GOBLINS GREEN/WELWYN GARDEN CITY+AL73ST": ('25', 'GOBLINS GREEN/WELWYN GARDEN CITY', 'AL7 3ST'),
    "32+GOBLINS GREEN/WELWYN GARDEN CITY+AL73ST": ('32', 'GOBLINS GREEN/WELWYN GARDEN CITY', 'AL7 3ST'),
    "94+BAKER ST/POTTERS BAR+EN62EP": ('94', 'BAKER ST/POTTERS BAR', 'EN6 2EP'),
    '33+Tudor Way+WD3JA': ('33', 'Tudor Way', 'WD3 8JA'),
    '120+Hazlewood lane +WD5 0HF': ('120', 'Hazlewood lane', 'WD5 0HE'),
    '35+Rosehill gardens +WD5 0HE': ('35', 'Rosehill gardens', 'WD5 0HF'),
    '18+Rosehill gardens +WD5 0HE': ('18', 'Rosehill gardens', 'WD5 0HF'),
    '34+Rosehill gardens +WD5 0HE': ('34', 'Rosehill gardens', 'WD5 0HF'),
    '58+Rosehill gardens +WD5 0HE': ('58', 'Rosehill gardens', 'WD5 0HF'),
    '48+Rosehill gardens +WD5 0HE': ('48', 'Rosehill gardens', 'WD5 0HF'),
    '45+Rosehill gardens +WD5 0HE': ('45', 'Rosehill gardens', 'WD5 0HF'),
    '6+Rosehill gardens +WD5 0HE': ('6', 'Rosehill gardens', 'WD5 0HF'),
    '2+Rosehill gardens +WD5 0HE': ('2', 'Rosehill gardens', 'WD5 0HF'),
    '29+Rosehill gardens +WD5 0HE': ('29', 'Rosehill gardens', 'WD5 0HF'),
    '61+GOLDEN DELL+AL8 4EE': ('61', 'GOLDEN DELL', 'AL7 4EE'),
    '2O+EDINBURGH AVENUE+WD3 8LB': ('20', 'EDINBURGH AVENUE', 'WD3 8LB'),
}

eco3_lookup = []
for _, row in tqdm(eco3_submissions.iterrows(), total=len(eco3_submissions)):
    hn = row["NO "]
    pc = row["Post Code"]
    street = row["Street / Block Name"]
    key = f"{hn}+{street}+{pc}"
    if key in not_on_master:
        continue

    if key in eco3_remap:
        hn, street, pc = eco3_remap[key]
        # The postcode is different to the asse

    # We filter the asset list, because it's hard to know how accurate this is
    m1 = master_list[
        (master_list["house_number_TEMP"].astype(str) == hn) &
        (master_list["postcode"] == pc)
        ]

    if m1.shape[0] != 1:
        raise ValueError(
            f"Error for {key} in the tracker"
        )

    eco3_lookup.append(
        {
            "row_id": row["row_id"],
            "thrive_property_id": m1["thrive_property_id"].values[0],
            "submission_house_number": row["NO "],
            "submission_address1": row["Street / Block Name"],
            "submission_postcode": row["Post Code"],
        }
    )

eco4_lookup = []
for _, row in tqdm(eco4_submissions.iterrows(), total=len(eco4_submissions)):
    hn = row["NO."]
    pc = row["Post Code"]
    street = row["Street / Block Name"]
    key = f"{hn}+{street}+{pc}"
    if key in not_on_master:
        continue

    if key in eco3_remap:
        hn, street, pc = eco3_remap[key]
        # The postcode is different to the asse

    # We filter the asset list, because it's hard to know how accurate this is
    m1 = master_list[
        (master_list["house_number_TEMP"].astype(str) == hn) &
        (master_list["postcode"].str.lower() == pc.lower())
        ]

    if m1.shape[0] != 1:
        raise ValueError(
            f"Error for {key} in the tracker"
        )

    eco4_lookup.append(
        {
            "row_id": row["row_id"],
            "thrive_property_id": m1["thrive_property_id"].values[0],
            "submission_house_number": row["NO."],
            "submission_address1": row["Street / Block Name"],
            "submission_postcode": row["Post Code"],
        }
    )

# We match the lookups back to the submission sheets
eco3_lookup = pd.DataFrame(eco3_lookup)
eco3_submissions = eco3_submissions.merge(
    eco3_lookup,
    how="left",
    on="row_id",
)

eco4_lookup = pd.DataFrame(eco4_lookup)
eco4_submissions = eco4_submissions.merge(
    eco4_lookup,
    how="left",
    on="row_id",
)

# Store
eco3_submissions.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO3 - with IDS.csv",
    index=False
)
eco4_submissions.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO4 - with IDS.csv",
    index=False
)