# Model/etl/customers/Futures Housing/validation_surveys.py

import pandas as pd


def get_band(sap_score_number):
    """Map a numeric SAP score onto its half-band label.

    Each EPC letter is split into a "High_" and a "Low_" half. A band covers
    the scores from its lower bound (inclusive) up to the lower bound of the
    next band above it (exclusive).

    Returns the label (e.g. "High_D"), or None when the score is below 1,
    is NaN, or is infinite.
    """
    # Anything that is not strictly below infinity (NaN, +inf) has no band
    if not sap_score_number < float("inf"):
        return None

    # Inclusive lower bound of every band, best band first
    lower_bounds = {
        "High_A": 96,
        "Low_A": 92,
        "High_B": 86,
        "Low_B": 81,
        "High_C": 74.5,
        "Low_C": 69,
        "High_D": 61.5,
        "Low_D": 55,
        "High_E": 46.5,
        "Low_E": 39,
        "High_F": 29.5,
        "Low_F": 21,
        "High_G": 10.5,
        "Low_G": 1,
    }

    # The bands are contiguous, so the first bound the score reaches is its band
    return next(
        (label for label, lowest in lower_bounds.items() if sap_score_number >= lowest),
        None,
    )


def classify_floor_area(floor_area):
    """Return the floor area band label for a total floor area.

    The bands are "0-72", "73-97", "98-199" and "200+"; each upper bound is
    inclusive, so 72 is "0-72" and anything above 72 up to 97 is "73-97".

    Returns None when the floor area is missing (None / NaN). Every ``<=``
    comparison against NaN is False, so without this guard a missing value
    would fall through to "200+". Returning None matches what ``get_band``
    does for a missing SAP score.
    """
    if pd.isna(floor_area):
        return None

    if floor_area <= 72:
        return "0-72"

    if floor_area <= 97:
        return "73-97"

    if floor_area <= 199:
        return "98-199"

    return "200+"


# Load the standardised asset list and tag every property with the two bands the archetypes are built from:
# the SAP half-band of its registered EPC score and the band of its total floor area.
asset_list = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - Futures Housing/ECO 4 Wates - Standardised.xlsx",
    sheet_name="Standardised Asset List",
).assign(
    starting_sap_band=lambda frame: frame["epc_sap_score_on_register"].apply(get_band),
    floor_area_band=lambda frame: frame["epc_total_floor_area"].apply(classify_floor_area),
)

# Objective:
# We need to get a reasonable estimate for the cost of works for properties that are EPC D or below
#
# Therefore:
# 1) We know that some properties that are currently EPC C may* qualify for ECO4 funding. Right now, we aren't trying
# to determine which properties at EPC C or above will qualify, just how much the works will cost for properties that
# do qualify
# 2) We cannot survey everything, so before we undertake too much risk we should produce some costings for each of the
# archetypes
#
# Driving Factors:
# 1) Floor area band & starting SAP band - this will determine how much funding is produced
# 2) Heating system - this will determine if the property needs a heating upgrade or not


# An archetype is one combination of floor area band, starting SAP band and heating system. Only properties with a
# registered SAP score of 68 or lower are counted. Archetypes are ordered from most to least common and re-indexed
# 0..n-1 in that order.
archetypes = (
    asset_list.loc[asset_list["epc_sap_score_on_register"] <= 68]
    .groupby(["floor_area_band", "starting_sap_band", "landlord_heating_system"])["landlord_property_id"]
    .nunique()
    .reset_index()
    .rename(columns={"landlord_property_id": "n_properties"})
    .sort_values("n_properties", ascending=False)
    .reset_index(drop=True)
)

# Cumulative coverage: how many properties (and what share of them) the archetypes down to each row account for
archetypes["running_total"] = archetypes["n_properties"].cumsum()
archetypes["cumulative_percentage"] = archetypes["running_total"] / archetypes["n_properties"].sum() * 100

# Heating flags: everything except "boiler - other fuel" is treated as electric; "boiler - other fuel" and
# electric storage heaters are the systems flagged as needing an upgrade
archetypes["is_electric"] = ~(archetypes["landlord_heating_system"] == "boiler - other fuel")
archetypes["needs_heating_upgrade"] = archetypes["landlord_heating_system"].isin(
    ["boiler - other fuel", "electric storage heaters"]
)

# Give every archetype an identifier up front. The id has to exist before any subset of `archetypes` is taken and
# before it is merged onto the asset list: selecting "archetype_id" from a frame that does not have the column yet
# raises a KeyError. Ids run 0..n-1 in the current row order of `archetypes`.
archetypes["archetype_id"] = range(len(archetypes))

# Right now, they don't want to treat the oil properties so we'll exclude them for the moment
electric_heated_archetypes = (
    archetypes[archetypes["landlord_heating_system"] != "boiler - other fuel"].copy().reset_index(drop=True)
)
# Recompute the cumulative figures over the non-oil archetypes only
electric_heated_archetypes["running_total"] = electric_heated_archetypes["n_properties"].cumsum()
electric_heated_archetypes["cumulative_percentage"] = (
    electric_heated_archetypes["running_total"] / electric_heated_archetypes["n_properties"].sum() * 100
)

# The main properties that need validation surveys are properties that require a heating upgrade.
# archetype_id is inherited from `archetypes`, so no lookup merge is needed here (merging it in again would produce
# duplicate, suffixed archetype_id columns).
electric_heated_archetypes = electric_heated_archetypes[electric_heated_archetypes["needs_heating_upgrade"]]

# Oil archetypes are kept separately; they carry archetype_id too, so they can be matched against the asset list
oil_archetypes = archetypes[
    archetypes["landlord_heating_system"] == "boiler - other fuel"
].copy().reset_index(drop=True)

# Tag every property with the id of its archetype. This is a left join, so properties that belong to no archetype
# are kept with a missing archetype_id.
asset_list = asset_list.merge(
    archetypes[["starting_sap_band", "floor_area_band", "landlord_heating_system", "archetype_id"]],
    how="left", on=["starting_sap_band", "floor_area_band", "landlord_heating_system"]
)

# Properties that belong to one of the electric archetypes needing a heating upgrade, with two helper columns:
#   postal_region - the part of the postcode before the first space
#   epc_age       - whole days between the EPC inspection date and the moment this script runs
properties_for_verification = asset_list.loc[
    asset_list["archetype_id"].isin(electric_heated_archetypes["archetype_id"].values)
].assign(
    postal_region=lambda frame: frame["domna_postcode"].str.split(" ").str[0].str.strip(),
    epc_age=lambda frame: (pd.Timestamp.now() - pd.to_datetime(frame["epc_inspection_date"])).dt.days,
)

# Two oil-heated properties are surveyed as well, so the two most prevalent oil archetypes join the list
archetypes_for_survey = pd.concat([electric_heated_archetypes, oil_archetypes.iloc[:2]])

# One property is picked per archetype. Properties without an EPC inspection date take priority; otherwise the
# property with the oldest EPC is used.
# NOTE(review): the selection below does not take the postal region into account.
sample = []
for _, archetype in archetypes_for_survey.iterrows():
    in_archetype = (
        (asset_list["archetype_id"] == archetype["archetype_id"])
        & (asset_list["floor_area_band"] == archetype["floor_area_band"])
        & (asset_list["starting_sap_band"] == archetype["starting_sap_band"])
    )
    candidates = asset_list[in_archetype]
    no_inspection_date = candidates["epc_inspection_date"].isna()

    if no_inspection_date.any():
        ranked = candidates[no_inspection_date]
    else:
        # Oldest inspection date first
        ranked = candidates.sort_values("epc_inspection_date", ascending=True)

    sample.extend(ranked.head(1).to_dict("records"))

# Keep only the columns that go into the survey sheet
sample = pd.DataFrame(sample)[
    [
        "landlord_property_id",
        "epc_inspection_date",
        "epc_sap_score_on_register",
        "starting_sap_band",
        "floor_area_band",
        "landlord_heating_system",
        "domna_postcode",
        "domna_full_address",
        "archetype_id",
    ]
]

# Property -> archetype lookup for the export; the id is written out as text
archetypes = asset_list[["landlord_property_id", "archetype_id"]].assign(
    archetype_id=lambda frame: frame["archetype_id"].astype(str)
)

filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - Futures Housing/archetypes.xlsx"

# One workbook, two tabs: the property-to-archetype lookup and the properties selected for survey
sheets = {"Archetypes": archetypes, "Survey Sample": sample}
with pd.ExcelWriter(filename) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# We store this

# Questions:
# 1) If Futures are considering changing properties that have oil heating systems, we could include them, giving
# 39 archetypes in total. Otherwise, we have 25 archetypes
# 2) Can Futures provide us with any information on the model of air source heat pumps, and the associated controls,
# that they're using?

# Recommendations:
# 1) If they are willing to upgrade the heating systems of the oil properties, surveying 18 properties will cover
#