# Model/etl/customers/thrive/Programme Analysis.py
# 2025-05-20 15:59:38 +01:00

# 439 lines
# 17 KiB
# Python

"""
The Thrive programme has not been completed to specification. This script re-builds the programme and attempts to
address the following concerns:
- Which properties have been surveyed
- Of the properties that have been surveyed, what has been installed
- Which properties have been visited
"""
import pandas as pd
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
# Thrive's tracker of properties and when each should have been surveyed. The column headers are
# taken from the third row of the "Tracker" sheet (header=2 is zero-based).
thrive_tracker = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List - "
    "Standardised.xlsx",
    sheet_name="Tracker",
    header=2,
)
original_asset_list = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List.xlsx",
    header=0,
)
# Find properties that are on the Thrive tracker but whose UPRN is not a "Client ref 1" on the
# original asset list. Both sides are compared as strings.
asset_list_refs = original_asset_list["Client ref 1"].astype(str).values
tracker_on_asset_list = thrive_tracker["UPRN"].astype(str).isin(asset_list_refs)
missed_properties = thrive_tracker[~tracker_on_asset_list].copy()
# Standardise the original asset list: keep only the columns listed below, rename them to the
# standard names, tag where the rows came from, and drop rows that are exact duplicates.
original_columns = {
    "Client ref 1": "thrive_property_id",
    "Address": "full_address",
    "Address Line 1": "address_line_1",
    "Address Line 2": "address_line_2",
    "Address Line 3": "address_line_3",
    "Address Line 4": "address_line_4",
    "County": "county",
    "Postcode": "postcode",
    "Block Name": "block_reference",
    "Construction Year": "construction_year",
    "Age band (calculated)": "age_band_calculated",
    "Property type": "property_type",
    "Client property type 1": "detailed_property_type",
    "Client property type 2": "detailed_property_type_2",
    "bed count": "number_of_bedrooms",
    "Heating Type": "heating_type",
    # These two keep their source names.
    "WFT Findings": "WFT Findings",
    "ECO Eligibility": "ECO Eligibility",
}
original_asset_list = (
    original_asset_list[list(original_columns)]
    .rename(columns=original_columns)
    .assign(**{"Data Source": "Original Asset List"})
    .drop_duplicates()
)
# Prepare the missed (tracker-only) properties for appending, using the information the tracker holds.
# The full address is assembled as "<#>, <Adress Line 1>, <Postcode>"; "Adress Line 1" [sic] is the
# column name this script reads from the tracker.
address_parts = [missed_properties[column].astype(str) for column in ("#", "Adress Line 1", "Postcode")]
missed_properties["Full Address"] = address_parts[0] + ", " + address_parts[1] + ", " + address_parts[2]

# Tracker column -> standard column. Also reused for the later data update, which is read with the
# same column names.
missed_columns = {
    "UPRN": "thrive_property_id",
    "Full Address": "full_address",
    "Short Address": "address_line_1",
    "Postcode": "postcode",
    "Property Type": "property_type",
    "Build Form": "build_form",
    "Build year": "age_band_calculated",
    "Assumed mm ": "assumed_loft_insulation_thickness",
    "SAP": "sap_rating",
}
missed_properties = missed_properties[list(missed_columns)].rename(columns=missed_columns)

# These rows did not come from the original asset list, so both findings columns are set to a fixed marker.
for status_column in ("WFT Findings", "ECO Eligibility"):
    missed_properties[status_column] = "Property Not Inspected"
missed_properties["Data Source"] = "Thrive Tracker"
# Collect the rows of original_asset_list whose thrive_property_id occurs more than once, sorted by
# id. NOTE: nothing is removed from original_asset_list here; `dupes` only gathers the repeats.
asset_ids = original_asset_list["thrive_property_id"]
dupe_ids = asset_ids[asset_ids.duplicated()].unique()
dupes = original_asset_list[asset_ids.isin(dupe_ids)].copy()
dupes = dupes.sort_values("thrive_property_id")

# Use the same "build_form" column name as the missed properties before stacking the two sources.
original_asset_list = original_asset_list.rename(columns={"detailed_property_type": "build_form"})
master_list = pd.concat([missed_properties, original_asset_list], ignore_index=True)
# We were provided with a data update (13.05.2025) for a sample of properties. Properties in the
# update that are not yet on the master list are added to it here.
data_update = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Property List "
    "13_05.xlsx",
    header=0,
)
# Compare ids as strings on BOTH sides, as the tracker/asset-list comparison does. Comparing the raw
# UPRN column against string-cast ids would find no matches whenever the UPRN column is not read as
# text, and every row of the update would then be treated as a new property.
known_ids = master_list["thrive_property_id"].astype(str).values
new_properties = data_update[~data_update["UPRN"].astype(str).isin(known_ids)].copy()
new_properties["Full Address"] = (
    new_properties["#"].astype(str) + ", " +
    new_properties["Adress Line 1"].astype(str) + ", " +
    new_properties["Postcode"].astype(str)
)
# The data update is read with the tracker's column names, so the tracker's mapping is reused.
new_properties = new_properties[list(missed_columns)].rename(columns=missed_columns)
new_properties["WFT Findings"] = "Property Not Inspected"
new_properties["ECO Eligibility"] = "Property Not Inspected"
new_properties["Data Source"] = "13.05.2025 Data Update"
# ignore_index avoids carrying repeated index labels from the two frames into master_list.
master_list = pd.concat([new_properties, master_list], ignore_index=True)
# Bring the data update's heating type, assumed loft insulation thickness and SAP rating onto the
# master list as "*_updated" columns, joined on the property id.
updated_field_names = {
    "Heating Type": "heating_type_updated",
    "Assumed mm ": "assumed_loft_insulation_thickness_updated",
    "SAP": "sap_rating_updated",
}
refreshed_values = data_update[["UPRN", *updated_field_names]].rename(columns=updated_field_names)
master_list = master_list.merge(
    refreshed_values,
    how="left",
    left_on="thrive_property_id",
    right_on="UPRN",
)
# Where the data update has no value for a property, fall back to the value already on the master list.
for field in ("heating_type", "assumed_loft_insulation_thickness", "sap_rating"):
    master_list[f"{field}_updated"] = master_list[f"{field}_updated"].fillna(master_list[field])

# Every property must appear exactly once from here on.
assert not master_list["thrive_property_id"].duplicated().any(), "Duplicate thrive_property_id found in master_list"

# Flag whether each property id also appears as a UPRN on the Thrive tracker (compared as strings).
tracker_uprns = thrive_tracker["UPRN"].astype(str).values
master_list["Address in tracker"] = master_list["thrive_property_id"].astype(str).isin(tracker_uprns)

# This is the asset list - call it the master asset list, updated May 2025.
# The merge key brought in from the data update is no longer needed.
master_list = master_list.drop(columns=["UPRN"])
master_list["thrive_property_id"] = master_list["thrive_property_id"].astype(str)
# master_list.to_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List - "
# "Complete - Updated May 2025.xlsx",
# )
def _house_number_for(record):
    """Return SearchEpc's house number for one master-list row (uses its full address and postcode)."""
    return SearchEpc.get_house_number(address=record["full_address"], postcode=record["postcode"])


# Temporary matching column: the house number derived from each row's full address and postcode.
master_list["house_number_TEMP"] = master_list.apply(_house_number_for, axis=1)
# We add in the status of the property
# TODO: Add the status of the property from the Thrive tracker
outcomes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive - Outcomes - April "
    "24-March25.xlsx",
    header=0,
)
# Temporary key so that the ids resolved for each row can be joined back on afterwards.
outcomes["row_id"] = outcomes.index

# Some ids have the same phone number but a different UPRN, so these UPRNs are left out of the
# tracker copy that outcomes are matched against.
excluded_uprns = ['OAKGRE0065ABBLDW1', 'OAKGRE0066ABBLDW1', 'JACKET0102ABBLDW1', 'BELLCL0008BEDMDW1']
usable_for_matching = ~thrive_tracker["UPRN"].isin(excluded_uprns)
tracker_for_matching = thrive_tracker[usable_for_matching].copy()

# Full address assembled as "<#>, <Adress Line 1>, <Postcode>".
tracker_address_parts = [
    tracker_for_matching[column].astype(str) for column in ("#", "Adress Line 1", "Postcode")
]
tracker_for_matching["Full Address"] = (
    tracker_address_parts[0] + ", " + tracker_address_parts[1] + ", " + tracker_address_parts[2]
)
# Resolve every outcomes row to a thrive_property_id. Each row is matched on house number + postcode,
# first against the tracker and, when the tracker has no match, against the master list.
#
# NOTE: matching on the tracker's "Primary Number" (retrying with the leading zeros stripped from the
# contact number) was tried previously and is disabled; house number + postcode is used instead.
outcomes_id_lookup = []

# Manual house-number corrections, keyed on the outcome sheet's "Address" value.
house_number_by_address = {
    "292 Micklefield Road": "292",
    "103a Norfolk Road Rickmansworth Hertfordshire WD3 1JY": "103a",
    "105a Norfolk Road Rickmansworth Hertfordshire WD3 1JY": "105a",
    "107a Norfolk Road Rickmansworth Hertfordshire WD3 1JY": "107a",
}

# String views of the house-number columns do not change between rows, so compute them once.
tracker_house_numbers = tracker_for_matching["#"].astype(str)
master_house_numbers = master_list["house_number_TEMP"].astype(str)

for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)):
    hn = str(x["No."])
    address = x["Address"]
    postcode = x["Postcode"]
    # The contact number is only used to make the error messages below identifiable.
    contact_no = str(x["Contact No"]) if not pd.isnull(x["Contact No"]) else str(x["Contact No.1"])
    contact_no = None if contact_no == "nan" else contact_no

    hn = house_number_by_address.get(address, hn)
    if address == "Micklefield Road" and hn == "302":
        hn = "292"

    m1 = tracker_for_matching[
        (tracker_house_numbers == hn) &
        (tracker_for_matching["Postcode"] == postcode)
    ]
    if m1.empty:
        # Some properties aren't in the tracker, we match to the master list
        m1 = master_list[
            (master_house_numbers == hn) &
            (master_list["postcode"] == postcode)
        ]
        if m1.empty:
            # Fail with the row's details rather than an IndexError from `.values[0]` below.
            raise ValueError(
                f"No match for {hn} - {address} - {postcode} - {contact_no} in the tracker or the master list"
            )
        # When several master-list rows match, the first one is used.
        outcomes_id_lookup.append(
            {
                "row_id": x["row_id"],
                "thrive_property_id": m1["thrive_property_id"].values[0],
                "address": m1["full_address"].values[0],
                "postcode": m1["postcode"].values[0],
            }
        )
        continue

    if m1.shape[0] != 1:
        raise ValueError(
            f"Error for {hn} - {address} - {postcode} - {contact_no} in the tracker"
        )
    # Exactly one tracker row matched: record its id and address details.
    outcomes_id_lookup.append(
        {
            "row_id": x["row_id"],
            "thrive_property_id": m1["UPRN"].values[0],
            "address": m1["Full Address"].values[0],
            "postcode": m1["Postcode"].values[0],
        }
    )
# Join the resolved ids back onto the outcomes via the temporary row_id key, drop that key, and
# shorten the two unwieldy column names.
outcomes_id_lookup = pd.DataFrame(outcomes_id_lookup)
outcomes = (
    outcomes
    .merge(outcomes_id_lookup, how="left", on="row_id")
    .drop(columns=["row_id"])
    .rename(
        columns={
            "Outcomes": "Outcome",
            "Notes (If 'no "
            "answer' under outcomes, have you checked around the property for access issues where possible?)": "Notes",
        }
    )
)
# Store the corrected outcomes
# outcomes.to_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive - Outcomes -
# April 24-March25 - Corrected.xlsx",
# index=False
# )
def parse_date(value):
    """Parse one "date letters sent" cell into a pandas Timestamp.

    String cells may carry a 'W.C' / 'w/c' prefix, which is removed before the remainder is parsed
    day-first. Cells that are not strings are handled as well instead of failing on ``.strip()``:
    blanks (None / NaN / NaT) give ``pd.NaT`` and any other value is passed to ``pd.to_datetime``
    unchanged.

    Args:
        value: The raw cell value.

    Returns:
        A ``pd.Timestamp``, or ``pd.NaT`` when the cell is blank or cannot be parsed.
    """
    if isinstance(value, str):
        # Strip any 'W.C' or 'w/c' prefix and clean whitespace
        value = value.strip().lower().replace('w.c', '').replace('w/c', '').strip()
    elif pd.isnull(value):
        return pd.NaT
    try:
        # errors='coerce' already maps unparseable text to NaT; the except covers unsupported types.
        return pd.to_datetime(value, dayfirst=True, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        return pd.NaT
# Turn the "Date letters sent" column into timestamps, one cell at a time.
outcomes['Parsed Date'] = outcomes['Date letters sent'].map(parse_date)

# Next step - match the submissions master to the asset list. We will append on the UPRN.
# Each submissions sheet gets a temporary row_id so that the matched ids can be joined back on later.
eco3_submissions = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO3.csv",
    header=0,
).assign(row_id=lambda frame: frame.index)
eco4_submissions = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO4.csv",
    header=0,
).assign(row_id=lambda frame: frame.index)
# List of properties never on the asset list. Entries are "<house number>+<street / block>+<postcode>"
# keys, the same shape the submission loops build for each row; matching rows are skipped.
not_on_master = [
    *(f"{number}+FOXGROVE PATH+WD19 6YL" for number in (7, 9, 11)),
    *(f"{number}+LINCOLN DRIVE+WD19 7BA" for number in (20, 22, 24, 26)),
    *(
        f"{number}+Ryman Court, Stag Lane+WD3 5HN"
        for number in (
            1, 6, 9, 10, 11, 12, 14, 15, 20, 21, 22, 25, 26, 31, 33, 34,
            37, 38, 39, 41, 43, 45, 46, 48, 49, 50, 52,
        )
    ),
]

# Manual corrections: submission key as it appears in the sheet -> (house number, street, postcode)
# to use for matching instead. The keys are kept exactly as entered, including stray spaces.
eco3_remap = {
    "19+OAKHILL ROAD+WD5 8RE": ("19", "OAKHILL ROAD", "WD3 9RE"),
    "29+OAKHILL ROAD+WD5 8RE": ("29", "OAKHILL ROAD", "WD3 9RE"),
    "31+OAKHILL ROAD+WD5 8RE": ("31", "OAKHILL ROAD", "WD3 9RE"),
    "44+OAKHILL ROAD+WD5 8RE": ("44", "OAKHILL ROAD", "WD3 9RF"),
    "64+OAKHILL ROAD+WD4 8RF": ("64", "OAKHILL ROAD", "WD3 9RF"),
    "11+LANCASTER WAY+WD3 PRE": ("11", "LANCASTER WAY", "WD5 0PQ"),
    "16+LANCASTER WAY+WD3 PRE": ("16", "LANCASTER WAY", "WD5 0PQ"),
    "58+TALBOT ROAD +WD31HE": ("58", "TALBOT ROAD", "WD3 1HE"),
    "10+PEARTREE COURT/WELWYN GARDEN CITY+AL73XN": ("10", "PEARTREE COURT/WELWYN GARDEN CITY", "AL7 3XN"),
    "25+GOBLINS GREEN/WELWYN GARDEN CITY+AL73ST": ("25", "GOBLINS GREEN/WELWYN GARDEN CITY", "AL7 3ST"),
    "32+GOBLINS GREEN/WELWYN GARDEN CITY+AL73ST": ("32", "GOBLINS GREEN/WELWYN GARDEN CITY", "AL7 3ST"),
    "94+BAKER ST/POTTERS BAR+EN62EP": ("94", "BAKER ST/POTTERS BAR", "EN6 2EP"),
    "33+Tudor Way+WD3JA": ("33", "Tudor Way", "WD3 8JA"),
    "120+Hazlewood lane +WD5 0HF": ("120", "Hazlewood lane", "WD5 0HE"),
    "35+Rosehill gardens +WD5 0HE": ("35", "Rosehill gardens", "WD5 0HF"),
    "18+Rosehill gardens +WD5 0HE": ("18", "Rosehill gardens", "WD5 0HF"),
    "34+Rosehill gardens +WD5 0HE": ("34", "Rosehill gardens", "WD5 0HF"),
    "58+Rosehill gardens +WD5 0HE": ("58", "Rosehill gardens", "WD5 0HF"),
    "48+Rosehill gardens +WD5 0HE": ("48", "Rosehill gardens", "WD5 0HF"),
    "45+Rosehill gardens +WD5 0HE": ("45", "Rosehill gardens", "WD5 0HF"),
    "6+Rosehill gardens +WD5 0HE": ("6", "Rosehill gardens", "WD5 0HF"),
    "2+Rosehill gardens +WD5 0HE": ("2", "Rosehill gardens", "WD5 0HF"),
    "29+Rosehill gardens +WD5 0HE": ("29", "Rosehill gardens", "WD5 0HF"),
    "61+GOLDEN DELL+AL8 4EE": ("61", "GOLDEN DELL", "AL7 4EE"),
    # The house number in this key is typed with a capital letter O instead of a zero.
    "2O+EDINBURGH AVENUE+WD3 8LB": ("20", "EDINBURGH AVENUE", "WD3 8LB"),
}
def _match_submissions_to_master(submissions, number_column):
    """Resolve each submission row to exactly one ``thrive_property_id`` on the master list.

    A key "<house number>+<street / block>+<postcode>" is built for every row. Rows whose key is in
    ``not_on_master`` are skipped; rows whose key is in ``eco3_remap`` (used for both ECO3 and ECO4)
    are matched with the corrected values instead. Matching is on house number and postcode, with the
    postcode compared case-insensitively for both sheets.

    Args:
        submissions: Submissions frame with "row_id", "Post Code", "Street / Block Name" and the
            house-number column named by ``number_column``.
        number_column: Name of the house-number column ("NO " for ECO3, "NO." for ECO4).

    Returns:
        A list with one dict per matched row: the row_id, the matched thrive_property_id, and the
        row's original house number, street and postcode.

    Raises:
        ValueError: If a row does not match exactly one master-list row.
    """
    # Loop-invariant work is done once: a set for O(1) skip checks, and string/lower-cased columns.
    skip_keys = set(not_on_master)
    house_numbers = master_list["house_number_TEMP"].astype(str)
    postcodes = master_list["postcode"].str.lower()

    lookup = []
    for _, row in tqdm(submissions.iterrows(), total=len(submissions)):
        hn = row[number_column]
        pc = row["Post Code"]
        street = row["Street / Block Name"]
        key = f"{hn}+{street}+{pc}"
        if key in skip_keys:
            continue
        if key in eco3_remap:
            # The submission's details differ from the asset list; use the corrected ones.
            hn, _street, pc = eco3_remap[key]
        # str() keeps the comparison valid when pandas read the values as numbers, and lets a blank
        # postcode fall through to the ValueError below rather than failing on `.lower()`.
        matches = master_list[(house_numbers == str(hn)) & (postcodes == str(pc).lower())]
        if matches.shape[0] != 1:
            raise ValueError(
                f"Error for {key}: expected exactly one match in the master list, found {matches.shape[0]}"
            )
        lookup.append(
            {
                "row_id": row["row_id"],
                "thrive_property_id": matches["thrive_property_id"].values[0],
                "submission_house_number": row[number_column],
                "submission_address1": row["Street / Block Name"],
                "submission_postcode": row["Post Code"],
            }
        )
    return lookup


eco3_lookup = _match_submissions_to_master(eco3_submissions, number_column="NO ")
eco4_lookup = _match_submissions_to_master(eco4_submissions, number_column="NO.")
# Join the matched ids back onto each submissions sheet via row_id. Rows that were skipped during
# matching have no lookup entry, so the left join leaves their new columns empty.
eco3_lookup = pd.DataFrame(eco3_lookup)
eco4_lookup = pd.DataFrame(eco4_lookup)
eco3_submissions = pd.merge(eco3_submissions, eco3_lookup, how="left", on="row_id")
eco4_submissions = pd.merge(eco4_submissions, eco4_lookup, how="left", on="row_id")

# Store both sheets, now carrying the ids, next to the source files.
eco3_submissions.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO3 - with IDS.csv",
    index=False,
)
eco4_submissions.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
    "ECO4 - with IDS.csv",
    index=False,
)