mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
439 lines
17 KiB
Python
439 lines
17 KiB
Python
"""
|
|
The Thrive programme has not been completed to specification. This script re-builds the programme and attempts to
|
|
address the following concerns:
|
|
- Which properties have been surveyed
|
|
- Of the properties that have been surveyed, what has been installed
|
|
- Which properties have been visited
|
|
|
|
"""
|
|
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
from backend.SearchEpc import SearchEpc
|
|
|
|
# This is Thrive's list of properties and when they should have been surveyed
|
|
thrive_tracker = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List - "
|
|
"Standardised.xlsx",
|
|
sheet_name="Tracker",
|
|
header=2
|
|
)
|
|
|
|
original_asset_list = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List.xlsx",
|
|
header=0
|
|
)
|
|
|
|
# Find properties that are on the thrive tracker that
|
|
missed_properties = thrive_tracker[
|
|
~thrive_tracker["UPRN"].astype(str).isin(original_asset_list["Client ref 1"].astype(str).values)
|
|
].copy()
|
|
|
|
# We produce the complete list, with all of the columns we need, for data standardisation
|
|
|
|
original_columns = {
|
|
"Client ref 1": "thrive_property_id",
|
|
"Address": "full_address",
|
|
"Address Line 1": "address_line_1",
|
|
"Address Line 2": "address_line_2",
|
|
"Address Line 3": "address_line_3",
|
|
"Address Line 4": "address_line_4",
|
|
"County": "county",
|
|
"Postcode": "postcode",
|
|
"Block Name": "block_reference",
|
|
"Construction Year": "construction_year",
|
|
"Age band (calculated)": "age_band_calculated",
|
|
"Property type": "property_type",
|
|
"Client property type 1": "detailed_property_type",
|
|
"Client property type 2": "detailed_property_type_2",
|
|
"bed count": "number_of_bedrooms",
|
|
"Heating Type": "heating_type",
|
|
"WFT Findings": "WFT Findings",
|
|
"ECO Eligibility": "ECO Eligibility",
|
|
}
|
|
|
|
original_asset_list = original_asset_list[original_columns.keys()].rename(columns=original_columns)
|
|
original_asset_list["Data Source"] = "Original Asset List"
|
|
original_asset_list = original_asset_list.drop_duplicates()
|
|
|
|
# We append on the missed properties, with the information we have
|
|
missed_properties["Full Address"] = (
|
|
missed_properties["#"].astype(str) + ", " +
|
|
missed_properties["Adress Line 1"].astype(str) + ", " +
|
|
missed_properties["Postcode"].astype(str)
|
|
)
|
|
missed_columns = {
|
|
"UPRN": "thrive_property_id",
|
|
"Full Address": "full_address",
|
|
"Short Address": "address_line_1",
|
|
"Postcode": "postcode",
|
|
"Property Type": "property_type",
|
|
"Build Form": "build_form",
|
|
"Build year": "age_band_calculated",
|
|
"Assumed mm ": "assumed_loft_insulation_thickness",
|
|
"SAP": "sap_rating",
|
|
}
|
|
|
|
missed_properties = missed_properties[missed_columns.keys()].rename(columns=missed_columns)
|
|
missed_properties["WFT Findings"] = "Property Not Inspected"
|
|
missed_properties["ECO Eligibility"] = "Property Not Inspected"
|
|
missed_properties["Data Source"] = "Thrive Tracker"
|
|
|
|
# We de-dupe ides in original_asset_list
|
|
dupe_ids = original_asset_list[original_asset_list["thrive_property_id"].duplicated()]["thrive_property_id"].unique()
|
|
dupes = original_asset_list[
|
|
original_asset_list["thrive_property_id"].isin(dupe_ids)
|
|
].copy()
|
|
dupes = dupes.sort_values("thrive_property_id")
|
|
|
|
original_asset_list = original_asset_list.rename(
|
|
columns={
|
|
"detailed_property_type": "build_form"
|
|
}
|
|
)
|
|
|
|
master_list = pd.concat([missed_properties, original_asset_list], ignore_index=True)
|
|
|
|
# We were provided with a data update for a sample of properties. We update the data with this information
|
|
data_update = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Property List "
|
|
"13_05.xlsx",
|
|
header=0
|
|
)
|
|
|
|
new_properties = data_update[~data_update["UPRN"].isin(master_list["thrive_property_id"].astype(str).values)].copy()
|
|
new_properties["Full Address"] = (
|
|
new_properties["#"].astype(str) + ", " +
|
|
new_properties["Adress Line 1"].astype(str) + ", " +
|
|
new_properties["Postcode"].astype(str)
|
|
)
|
|
new_properties = new_properties[missed_columns.keys()].rename(columns=missed_columns)
|
|
new_properties["WFT Findings"] = "Property Not Inspected"
|
|
new_properties["ECO Eligibility"] = "Property Not Inspected"
|
|
new_properties["Data Source"] = "13.05.2025 Data Update"
|
|
|
|
master_list = pd.concat([new_properties, master_list])
|
|
|
|
# We append any new data on heating system, heating type, and insulation type, based on the data update
|
|
master_list = master_list.merge(
|
|
data_update[["UPRN", "Heating Type", "Assumed mm ", "SAP"]].rename(
|
|
columns={
|
|
"Heating Type": "heating_type_updated",
|
|
"Assumed mm ": "assumed_loft_insulation_thickness_updated",
|
|
"SAP": "sap_rating_updated"
|
|
}
|
|
),
|
|
how="left",
|
|
left_on="thrive_property_id",
|
|
right_on="UPRN"
|
|
)
|
|
|
|
# We fill the missings
|
|
master_list["heating_type_updated"] = master_list["heating_type_updated"].fillna(master_list["heating_type"])
|
|
master_list["assumed_loft_insulation_thickness_updated"] = master_list[
|
|
"assumed_loft_insulation_thickness_updated"
|
|
].fillna(master_list["assumed_loft_insulation_thickness"])
|
|
master_list["sap_rating_updated"] = master_list["sap_rating_updated"].fillna(master_list["sap_rating"])
|
|
|
|
assert not master_list["thrive_property_id"].duplicated().sum(), "Duplicate thrive_property_id found in master_list"
|
|
|
|
master_list["Address in tracker"] = master_list["thrive_property_id"].astype(str).isin(
|
|
thrive_tracker["UPRN"].astype(str).values
|
|
)
|
|
|
|
# Those the asset list - call it master asset list updated May2025
|
|
master_list = master_list.drop(columns=["UPRN"])
|
|
master_list["thrive_property_id"] = master_list["thrive_property_id"].astype(str)
|
|
# master_list.to_excel(
|
|
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Asset List - "
|
|
# "Complete - Updated May 2025.xlsx",
|
|
# )
|
|
|
|
master_list["house_number_TEMP"] = master_list.apply(
|
|
lambda x: SearchEpc.get_house_number(address=x["full_address"], postcode=x["postcode"]),
|
|
axis=1
|
|
)
|
|
|
|
# We add in the status of the property
|
|
# TODO: Add the status of the property from the Thrive tracker
|
|
outcomes = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive - Outcomes - April "
|
|
"24-March25.xlsx",
|
|
header=0
|
|
)
|
|
outcomes["row_id"] = outcomes.index
|
|
|
|
# We have two ids which have the same phohe. nymber, but different UPRN, so we don't match to the tracker for these
|
|
tracker_for_matching = thrive_tracker[
|
|
~thrive_tracker["UPRN"].isin(['OAKGRE0065ABBLDW1', 'OAKGRE0066ABBLDW1', 'JACKET0102ABBLDW1', 'BELLCL0008BEDMDW1'])
|
|
].copy()
|
|
tracker_for_matching["Full Address"] = (
|
|
tracker_for_matching["#"].astype(str) + ", " +
|
|
tracker_for_matching["Adress Line 1"].astype(str) + ", " +
|
|
tracker_for_matching["Postcode"].astype(str)
|
|
)
|
|
|
|
outcomes_id_lookup = []
|
|
for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)):
|
|
|
|
hn = str(x["No."])
|
|
address = x["Address"]
|
|
postcode = x["Postcode"]
|
|
contact_no = str(x["Contact No"]) if not pd.isnull(x["Contact No"]) else str(x["Contact No.1"])
|
|
contact_no = None if contact_no == "nan" else contact_no
|
|
|
|
if address == "292 Micklefield Road":
|
|
hn = "292"
|
|
|
|
if (address == "Micklefield Road") & (hn == "302"):
|
|
hn = "292"
|
|
|
|
if (address == "103a Norfolk Road Rickmansworth Hertfordshire WD3 1JY"):
|
|
hn = "103a"
|
|
|
|
if (address == "105a Norfolk Road Rickmansworth Hertfordshire WD3 1JY"):
|
|
hn = "105a"
|
|
|
|
if (address == "107a Norfolk Road Rickmansworth Hertfordshire WD3 1JY"):
|
|
hn = "107a"
|
|
|
|
#
|
|
# # We match this to the tracker
|
|
# m1 = tracker_for_matching[tracker_for_matching["Primary Number"].astype(str) == contact_no]
|
|
# # Many of the phone numbers don't have a leading zero in the tracker so we add them
|
|
# if (m1.shape[0] != 1) and not pd.isnull(contact_no):
|
|
# m1 = tracker_for_matching[tracker_for_matching["Primary Number"].astype(str) == contact_no.lstrip("0")]
|
|
#
|
|
# if m1.shape[0] > 1:
|
|
# raise ValueError(
|
|
# f"Error for {hn} - {address} - {postcode} - {contact_no} in the tracker"
|
|
# )
|
|
|
|
# if m1.empty:
|
|
m1 = tracker_for_matching[
|
|
(tracker_for_matching["#"].astype(str) == hn) &
|
|
(tracker_for_matching["Postcode"] == postcode)
|
|
]
|
|
|
|
if m1.empty:
|
|
# Some properties aren't in the tracker, we match to the master list
|
|
m1 = master_list[
|
|
(master_list["house_number_TEMP"].astype(str) == hn) &
|
|
(master_list["postcode"] == postcode)
|
|
]
|
|
outcomes_id_lookup.append(
|
|
{
|
|
"row_id": x["row_id"],
|
|
"thrive_property_id": m1["thrive_property_id"].values[0],
|
|
"address": m1["full_address"].values[0],
|
|
"postcode": m1["postcode"].values[0],
|
|
}
|
|
)
|
|
continue
|
|
|
|
if m1.shape[0] != 1:
|
|
raise ValueError(
|
|
f"Error for {hn} - {address} - {postcode} - {contact_no} in the tracker"
|
|
)
|
|
|
|
# We add the status to the master list
|
|
outcomes_id_lookup.append(
|
|
{
|
|
"row_id": x["row_id"],
|
|
"thrive_property_id": m1["UPRN"].values[0],
|
|
"address": m1["Full Address"].values[0],
|
|
"postcode": m1["Postcode"].values[0],
|
|
}
|
|
)
|
|
|
|
outcomes_id_lookup = pd.DataFrame(outcomes_id_lookup)
|
|
outcomes = outcomes.merge(
|
|
outcomes_id_lookup,
|
|
how="left",
|
|
left_on="row_id",
|
|
right_on="row_id"
|
|
)
|
|
|
|
outcomes = outcomes.drop(columns=["row_id"])
|
|
outcomes = outcomes.rename(
|
|
columns={
|
|
"Outcomes": "Outcome",
|
|
"Notes (If 'no "
|
|
"answer' under outcomes, have you checked around the property for access issues where possible?)": "Notes",
|
|
}
|
|
)
|
|
|
|
|
|
# Store the corrected outcomes
|
|
# outcomes.to_excel(
|
|
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive - Outcomes -
|
|
# April 24-March25 - Corrected.xlsx",
|
|
# index=False
|
|
# )
|
|
|
|
|
|
def parse_date(value):
|
|
# Strip any 'W.C' or 'w/c' prefix and clean whitespace
|
|
value = value.strip().lower().replace('w.c', '').replace('w/c', '').strip()
|
|
try:
|
|
# Try parsing the date with dayfirst=True
|
|
return pd.to_datetime(value, dayfirst=True, errors='coerce')
|
|
except Exception:
|
|
return pd.NaT
|
|
|
|
|
|
outcomes['Parsed Date'] = outcomes['Date letters sent'].apply(parse_date)
|
|
|
|
# Next step - match the submissions master to the asset list. We will append on the UPRN
|
|
eco3_submissions = pd.read_csv(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
|
|
"ECO3.csv",
|
|
header=0
|
|
)
|
|
eco3_submissions["row_id"] = eco3_submissions.index
|
|
|
|
eco4_submissions = pd.read_csv(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
|
|
"ECO4.csv",
|
|
header=0
|
|
)
|
|
eco4_submissions["row_id"] = eco4_submissions.index
|
|
|
|
# List of properties never on the asset list
|
|
not_on_master = [
|
|
"7+FOXGROVE PATH+WD19 6YL", "9+FOXGROVE PATH+WD19 6YL", "11+FOXGROVE PATH+WD19 6YL",
|
|
"20+LINCOLN DRIVE+WD19 7BA", "22+LINCOLN DRIVE+WD19 7BA", "24+LINCOLN DRIVE+WD19 7BA",
|
|
"26+LINCOLN DRIVE+WD19 7BA", "1+Ryman Court, Stag Lane+WD3 5HN", "6+Ryman Court, Stag Lane+WD3 5HN",
|
|
"9+Ryman Court, Stag Lane+WD3 5HN", "10+Ryman Court, Stag Lane+WD3 5HN", "11+Ryman Court, Stag Lane+WD3 5HN",
|
|
"12+Ryman Court, Stag Lane+WD3 5HN", "14+Ryman Court, Stag Lane+WD3 5HN", "15+Ryman Court, Stag Lane+WD3 5HN",
|
|
"20+Ryman Court, Stag Lane+WD3 5HN", "21+Ryman Court, Stag Lane+WD3 5HN", "22+Ryman Court, Stag Lane+WD3 5HN",
|
|
"25+Ryman Court, Stag Lane+WD3 5HN", "26+Ryman Court, Stag Lane+WD3 5HN", "31+Ryman Court, Stag Lane+WD3 5HN",
|
|
"33+Ryman Court, Stag Lane+WD3 5HN", "34+Ryman Court, Stag Lane+WD3 5HN",
|
|
'37+Ryman Court, Stag Lane+WD3 5HN', '38+Ryman Court, Stag Lane+WD3 5HN', '39+Ryman Court, Stag Lane+WD3 5HN',
|
|
'41+Ryman Court, Stag Lane+WD3 5HN', '43+Ryman Court, Stag Lane+WD3 5HN', '45+Ryman Court, Stag Lane+WD3 5HN',
|
|
'46+Ryman Court, Stag Lane+WD3 5HN', '48+Ryman Court, Stag Lane+WD3 5HN', '49+Ryman Court, Stag Lane+WD3 5HN',
|
|
'50+Ryman Court, Stag Lane+WD3 5HN', '52+Ryman Court, Stag Lane+WD3 5HN'
|
|
]
|
|
|
|
eco3_remap = {
|
|
"19+OAKHILL ROAD+WD5 8RE": ('19', 'OAKHILL ROAD', 'WD3 9RE'),
|
|
"29+OAKHILL ROAD+WD5 8RE": ('29', 'OAKHILL ROAD', 'WD3 9RE'),
|
|
"31+OAKHILL ROAD+WD5 8RE": ('31', 'OAKHILL ROAD', 'WD3 9RE'),
|
|
"44+OAKHILL ROAD+WD5 8RE": ('44', 'OAKHILL ROAD', 'WD3 9RF'),
|
|
"64+OAKHILL ROAD+WD4 8RF": ('64', 'OAKHILL ROAD', 'WD3 9RF'),
|
|
"11+LANCASTER WAY+WD3 PRE": ('11', 'LANCASTER WAY', 'WD5 0PQ'),
|
|
"16+LANCASTER WAY+WD3 PRE": ('16', 'LANCASTER WAY', 'WD5 0PQ'),
|
|
"58+TALBOT ROAD +WD31HE": ('58', 'TALBOT ROAD', 'WD3 1HE'),
|
|
"10+PEARTREE COURT/WELWYN GARDEN CITY+AL73XN": ('10', 'PEARTREE COURT/WELWYN GARDEN CITY', 'AL7 3XN'),
|
|
"25+GOBLINS GREEN/WELWYN GARDEN CITY+AL73ST": ('25', 'GOBLINS GREEN/WELWYN GARDEN CITY', 'AL7 3ST'),
|
|
"32+GOBLINS GREEN/WELWYN GARDEN CITY+AL73ST": ('32', 'GOBLINS GREEN/WELWYN GARDEN CITY', 'AL7 3ST'),
|
|
"94+BAKER ST/POTTERS BAR+EN62EP": ('94', 'BAKER ST/POTTERS BAR', 'EN6 2EP'),
|
|
'33+Tudor Way+WD3JA': ('33', 'Tudor Way', 'WD3 8JA'),
|
|
'120+Hazlewood lane +WD5 0HF': ('120', 'Hazlewood lane', 'WD5 0HE'),
|
|
'35+Rosehill gardens +WD5 0HE': ('35', 'Rosehill gardens', 'WD5 0HF'),
|
|
'18+Rosehill gardens +WD5 0HE': ('18', 'Rosehill gardens', 'WD5 0HF'),
|
|
'34+Rosehill gardens +WD5 0HE': ('34', 'Rosehill gardens', 'WD5 0HF'),
|
|
'58+Rosehill gardens +WD5 0HE': ('58', 'Rosehill gardens', 'WD5 0HF'),
|
|
'48+Rosehill gardens +WD5 0HE': ('48', 'Rosehill gardens', 'WD5 0HF'),
|
|
'45+Rosehill gardens +WD5 0HE': ('45', 'Rosehill gardens', 'WD5 0HF'),
|
|
'6+Rosehill gardens +WD5 0HE': ('6', 'Rosehill gardens', 'WD5 0HF'),
|
|
'2+Rosehill gardens +WD5 0HE': ('2', 'Rosehill gardens', 'WD5 0HF'),
|
|
'29+Rosehill gardens +WD5 0HE': ('29', 'Rosehill gardens', 'WD5 0HF'),
|
|
'61+GOLDEN DELL+AL8 4EE': ('61', 'GOLDEN DELL', 'AL7 4EE'),
|
|
'2O+EDINBURGH AVENUE+WD3 8LB': ('20', 'EDINBURGH AVENUE', 'WD3 8LB'),
|
|
}
|
|
|
|
eco3_lookup = []
|
|
for _, row in tqdm(eco3_submissions.iterrows(), total=len(eco3_submissions)):
|
|
hn = row["NO "]
|
|
pc = row["Post Code"]
|
|
street = row["Street / Block Name"]
|
|
key = f"{hn}+{street}+{pc}"
|
|
if key in not_on_master:
|
|
continue
|
|
|
|
if key in eco3_remap:
|
|
hn, street, pc = eco3_remap[key]
|
|
# The postcode is different to the asse
|
|
|
|
# We filter the asset list, because it's hard to know how accurate this is
|
|
m1 = master_list[
|
|
(master_list["house_number_TEMP"].astype(str) == hn) &
|
|
(master_list["postcode"] == pc)
|
|
]
|
|
|
|
if m1.shape[0] != 1:
|
|
raise ValueError(
|
|
f"Error for {key} in the tracker"
|
|
)
|
|
|
|
eco3_lookup.append(
|
|
{
|
|
"row_id": row["row_id"],
|
|
"thrive_property_id": m1["thrive_property_id"].values[0],
|
|
"submission_house_number": row["NO "],
|
|
"submission_address1": row["Street / Block Name"],
|
|
"submission_postcode": row["Post Code"],
|
|
}
|
|
)
|
|
|
|
eco4_lookup = []
|
|
for _, row in tqdm(eco4_submissions.iterrows(), total=len(eco4_submissions)):
|
|
hn = row["NO."]
|
|
pc = row["Post Code"]
|
|
street = row["Street / Block Name"]
|
|
key = f"{hn}+{street}+{pc}"
|
|
if key in not_on_master:
|
|
continue
|
|
|
|
if key in eco3_remap:
|
|
hn, street, pc = eco3_remap[key]
|
|
# The postcode is different to the asse
|
|
|
|
# We filter the asset list, because it's hard to know how accurate this is
|
|
m1 = master_list[
|
|
(master_list["house_number_TEMP"].astype(str) == hn) &
|
|
(master_list["postcode"].str.lower() == pc.lower())
|
|
]
|
|
|
|
if m1.shape[0] != 1:
|
|
raise ValueError(
|
|
f"Error for {key} in the tracker"
|
|
)
|
|
|
|
eco4_lookup.append(
|
|
{
|
|
"row_id": row["row_id"],
|
|
"thrive_property_id": m1["thrive_property_id"].values[0],
|
|
"submission_house_number": row["NO."],
|
|
"submission_address1": row["Street / Block Name"],
|
|
"submission_postcode": row["Post Code"],
|
|
}
|
|
)
|
|
|
|
# We match the lookups back to the submission sheets
|
|
eco3_lookup = pd.DataFrame(eco3_lookup)
|
|
eco3_submissions = eco3_submissions.merge(
|
|
eco3_lookup,
|
|
how="left",
|
|
on="row_id",
|
|
)
|
|
|
|
eco4_lookup = pd.DataFrame(eco4_lookup)
|
|
eco4_submissions = eco4_submissions.merge(
|
|
eco4_lookup,
|
|
how="left",
|
|
on="row_id",
|
|
)
|
|
|
|
# Store
|
|
eco3_submissions.to_csv(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
|
|
"ECO3 - with IDS.csv",
|
|
index=False
|
|
)
|
|
eco4_submissions.to_csv(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thrive/Programme Reconciliation/Thrive Submissions "
|
|
"ECO4 - with IDS.csv",
|
|
index=False
|
|
)
|