Model/etl/customers/Futures Housing/validation_surveys.py
2025-06-17 15:59:07 +01:00

167 lines
6.6 KiB
Python

import pandas as pd
def get_band(sap_score_number):
    """Return the half-band label (e.g. ``"High_C"``) for a SAP score.

    Each band covers ``lower <= score < upper``. Scores that fall in no band
    (below 1, or NaN, for which every comparison is false) yield ``None``.
    """
    # (label, inclusive lower bound, exclusive upper bound), best band first.
    thresholds = (
        ("High_A", 96, float("inf")),
        ("Low_A", 92, 96),
        ("High_B", 86, 92),
        ("Low_B", 81, 86),
        ("High_C", 74.5, 81),
        ("Low_C", 69, 74.5),
        ("High_D", 61.5, 69),
        ("Low_D", 55, 61.5),
        ("High_E", 46.5, 55),
        ("Low_E", 39, 46.5),
        ("High_F", 29.5, 39),
        ("Low_F", 21, 29.5),
        ("High_G", 10.5, 21),
        ("Low_G", 1, 10.5),
    )
    matching_labels = (
        label
        for label, floor, ceiling in thresholds
        if floor <= sap_score_number < ceiling
    )
    # First (and only possible) match wins; None when nothing matches.
    return next(matching_labels, None)
def classify_floor_area(floor_area):
    """Return the floor-area band label for a total floor area.

    Bands are ``"0-72"`` (up to and including 72), ``"73-97"`` (up to and
    including 97), ``"98-199"`` (up to and including 199) and ``"200+"``.

    A missing floor area (``None`` / NaN) returns ``None``. Without this guard
    NaN fails every ``<=`` comparison and would fall through to ``"200+"``,
    silently placing properties with no floor-area data in the largest band.
    Note that pandas ``groupby`` drops missing keys by default, so such
    properties are left out of any grouping on this band.
    """
    if pd.isna(floor_area):
        return None
    if floor_area <= 72:
        return "0-72"
    if floor_area <= 97:
        return "73-97"
    if floor_area <= 199:
        return "98-199"
    return "200+"
# Load the standardised asset list and tag every property with the two banding columns
# (SAP half-band and floor-area band) that the archetype grouping is built on.
ASSET_LIST_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - Futures Housing/ECO 4 Wates - Standardised.xlsx"
)
asset_list = pd.read_excel(ASSET_LIST_PATH, sheet_name="Standardised Asset List")
asset_list = asset_list.assign(
    starting_sap_band=asset_list["epc_sap_score_on_register"].apply(get_band),
    floor_area_band=asset_list["epc_total_floor_area"].apply(classify_floor_area),
)
# Objective:
# We need to get a reasonable estimate for the cost of works for properties that are EPC D or below
#
# Therefore:
# 1) We know that some properties that are currently EPC C may* qualify for ECO4 funding. Right now, we aren't trying
# to determine which EPC C properties or above will qualify, just how much works will cost for properties that do
# qualify
# 2) We cannot survey everything, so before we undertake too much risk we should produce some costings for each of the
# archetypes
#
# Driving Factors:
# 1) Floor area band & starting SAP band - this will determine how much funding is produced
# 2) Heating system - this will determine if the property needs a heating upgrade or not
# Build the archetype table: one row per (floor area band, starting SAP band, heating system)
# combination among properties with a registered SAP score of 68 or less, with the number of
# distinct properties in each, ordered from most to least common.
ARCHETYPE_KEYS = ["floor_area_band", "starting_sap_band", "landlord_heating_system"]
low_sap_properties = asset_list[asset_list["epc_sap_score_on_register"] <= 68]
archetypes = (
    low_sap_properties.groupby(ARCHETYPE_KEYS)["landlord_property_id"]
    .nunique()
    .reset_index(name="n_properties")
    .sort_values("n_properties", ascending=False)
)

# Cumulative coverage: how many properties (and what share of them) the top-N archetypes account for.
total_properties = archetypes["n_properties"].sum()
archetypes["running_total"] = archetypes["n_properties"].cumsum()
archetypes["cumulative_percentage"] = archetypes["running_total"] / total_properties * 100

# Heating flags: everything except "boiler - other fuel" is flagged as electric; that system and
# electric storage heaters are the ones flagged as needing a heating upgrade.
heating_system = archetypes["landlord_heating_system"]
archetypes["is_electric"] = heating_system.ne("boiler - other fuel")
archetypes["needs_heating_upgrade"] = heating_system.isin(
    ["boiler - other fuel", "electric storage heaters"]
)

# Renumber rows 0..n-1 in prevalence order.
archetypes = archetypes.reset_index(drop=True)
# Give every archetype its id (its row position in the prevalence-ordered table) BEFORE any subset
# is copied from `archetypes`. Previously the id was assigned only after the electric subset had
# tried to merge it in (a KeyError on a top-to-bottom run) and after the oil subset had been
# copied (so the oil rows never carried an id).
archetypes["archetype_id"] = archetypes.index

# Right now, they don't want to treat the oil properties so we'll exclude them for the moment
electric_heated_archetypes = (
    archetypes[archetypes["landlord_heating_system"] != "boiler - other fuel"].copy().reset_index(drop=True)
)
# Recompute the cumulative coverage over the non-oil archetypes only.
electric_heated_archetypes["running_total"] = electric_heated_archetypes["n_properties"].cumsum()
electric_heated_archetypes["cumulative_percentage"] = (
    electric_heated_archetypes["running_total"] / electric_heated_archetypes["n_properties"].sum() * 100
)

# The main properties that need validation surveys are properties that require a heating upgrade.
# `archetype_id` was copied along with the rows above, so there is no need to merge it back in
# (doing so would now produce duplicated, suffixed id columns).
electric_heated_archetypes = electric_heated_archetypes[
    electric_heated_archetypes["needs_heating_upgrade"]
].reset_index(drop=True)

# Oil-heated archetypes, kept separately; they also carry `archetype_id`.
oil_archetypes = archetypes[
    archetypes["landlord_heating_system"] == "boiler - other fuel"
].copy().reset_index(drop=True)

# Attach the archetype id to every property. Properties matching no archetype keep a missing id
# because this is a left join.
asset_list = asset_list.merge(
    archetypes[["starting_sap_band", "floor_area_band", "landlord_heating_system", "archetype_id"]],
    how="left", on=["starting_sap_band", "floor_area_band", "landlord_heating_system"]
)

# Properties belonging to an archetype that needs a heating upgrade (non-oil).
properties_for_verification = asset_list[
    asset_list["archetype_id"].isin(electric_heated_archetypes["archetype_id"].values)
].copy()
# Outward part of the postcode: the text before the first space.
properties_for_verification["postal_region"] = (
    properties_for_verification["domna_postcode"].str.split(" ").str[0].str.strip()
)
# Age of the EPC in whole days, measured from the time the script runs.
properties_for_verification["epc_age"] = (
    pd.Timestamp.now() - pd.to_datetime(properties_for_verification["epc_inspection_date"])
).dt.days
# We also survey 2 oil heater properties, so we take the 2 most prevalent oil archetypes
# (the archetype table is ordered by property count, descending).
archetypes_for_survey = pd.concat(
    [electric_heated_archetypes, oil_archetypes.head(2)]
)

# Columns kept in the survey sample sheet.
SAMPLE_COLUMNS = [
    "landlord_property_id", "epc_inspection_date", "epc_sap_score_on_register", "starting_sap_band",
    "floor_area_band", "landlord_heating_system", "domna_postcode", "domna_full_address", "archetype_id"
]

# Pick one property per archetype. A property with no EPC inspection date is preferred
# (presumably these are the "estimated" properties - confirm); otherwise take the one with the
# oldest inspection date. NOTE(review): region is not taken into account here.
sample = []
for _, config in archetypes_for_survey.iterrows():
    properties = asset_list[
        (asset_list["archetype_id"] == config["archetype_id"]) &
        (asset_list["floor_area_band"] == config["floor_area_band"]) &
        (asset_list["starting_sap_band"] == config["starting_sap_band"])
    ]
    missing_inspection_date = properties["epc_inspection_date"].isnull()
    if missing_inspection_date.any():
        chosen = properties[missing_inspection_date].head(1)
    else:
        # Take the property with the oldest EPC
        chosen = properties.sort_values("epc_inspection_date", ascending=True).head(1)
    # `chosen` is empty when no property matches the archetype; nothing is added in that case.
    sample.extend(chosen.to_dict("records"))

if sample:
    sample = pd.DataFrame(sample)[SAMPLE_COLUMNS]
else:
    # Nothing was selected: keep a well-formed, empty frame rather than failing with a KeyError
    # when selecting columns from a column-less DataFrame.
    sample = pd.DataFrame(columns=SAMPLE_COLUMNS)
# Property -> archetype lookup for export, with the archetype id written out as text.
# Note: from here on `archetypes` refers to this per-property lookup, not the archetype summary table.
archetypes = asset_list[["landlord_property_id", "archetype_id"]].astype({"archetype_id": str})

filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - Futures Housing/archetypes.xlsx"

# Write a single workbook with two sheets: "Archetypes" (property id -> archetype id) and
# "Survey Sample" (the properties selected for validation surveys).
with pd.ExcelWriter(filename) as writer:
    for _sheet_name, _frame in (("Archetypes", archetypes), ("Survey Sample", sample)):
        _frame.to_excel(writer, sheet_name=_sheet_name, index=False)
# We store this
# Questions:
# 1) If Futures are considering changing properties that have oil heating systems, we could include them, which
# gives 39 archetypes in total. Otherwise, we have 25 archetypes
# 2) Can Futures provide us with any information on the model of air source heat pumps and associated controls
# they're using?
# Recommendations:
# 1) If they are willing to upgrade the heating systems of the oil properties, surveying 18 properties will cover
#