Model/etl/customers/stonewater/shdf_3_clustering.py
Khalim Conn-Kowlessar 58cbef9a2b stonewater tweaks
2024-07-24 13:33:54 +01:00

2671 lines
101 KiB
Python

import json
from tqdm import tqdm
import os
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
import urllib.parse
import requests
from datetime import datetime
from scipy import stats
from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances_argmin_min
# Load environment variables from the backend .env file before reading the EPC token below
load_dotenv(dotenv_path="backend/.env")
# Token handed to SearchEpc as `auth_token`; os.getenv returns None when the variable is not set
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
# Manually resolved UPRNs for properties where we do have an EPC, but the EPC record
# didn't give us a UPRN. Each pair is (internal_id, mapped_uprn).
_MANUAL_UPRN_PAIRS = (
    # 1 Church Street, Alfreton, DE55 7AH - doesn't seem to exist any more
    (78, None),
    # 1 Granville Road, Luton, LU1 1PA
    (315, 100080148856),
    # 11 College Street, Birstall, Batley, WF17 9HF
    # (the EPC record is for 11 and 11a)
    (1090, 83190440),
    # 11a College Street, Birstall, Batley, WF17 9HF
    (1092, 83143766),
    # Flat 5 Friars Street, Hereford, HR4 0AS
    # TODO: Check this - the UPRN is for 5 Friars Court, which is a flat
    (1384, 200002600892),
    # Flat 7 Friars Street, Hereford, HR4 0AS
    # TODO: Check this - the UPRN is for 7 Friars Court, which is a flat
    (1385, 200002600894),
    # 1 Waverley Street, Dudley, DY2 0YE
    (3349, 90022438),
    # 5 Brighton Road, Burgh Heath, Tadworth, KT20 6BQ
    # TODO: Check this - the UPRN is for 5 Copthorne, Brighton Road, Burgh Heath, KT20 6BQ,
    # which is a flat
    (5027, 100062145273),
    # Room 1, 21 Coxford Road, Southampton, SO16 5FG
    # (the UPRN is for 21 Coxford Road)
    (5554, 100060692392),
)

# Lookup table with one row per manually mapped property: internal_id -> mapped_uprn
missing_uprn_map = pd.DataFrame(
    [
        {"internal_id": mapped_internal_id, "mapped_uprn": mapped_uprn}
        for mapped_internal_id, mapped_uprn in _MANUAL_UPRN_PAIRS
    ]
)

# internal_ids whose EPC records are filtered out of the EPC data in compile_data
internal_id_epcs_to_drop = [315, 1384, 1385, 3349]
def remove_commas_and_full_stops(input_string: str) -> str:
    """
    Strip every comma and full stop out of a string.

    Used to normalise addresses before fuzzy string comparison.

    Args:
        input_string (str): The text to clean.
    Returns:
        str: A copy of the text with all ',' and '.' characters deleted.
    """
    # A translation table whose third argument lists the characters to delete
    punctuation_to_delete = str.maketrans("", "", ",.")
    return input_string.translate(punctuation_to_delete)
def get_places_with_retry(searcher, max_retries=5, wait_time=2):
    """
    Call searcher.ordnance_survey_client.get_places_api() until it succeeds or
    the retry budget is used up.

    A call counts as successful when the response's "status" entry equals 200.
    Any exception raised by the call is caught, reported and treated as a failed attempt.
    After every failed attempt except the last one, the function sleeps for wait_time seconds.

    Args:
        searcher (object): Object exposing an ordnance_survey_client with a get_places_api method.
        max_retries (int): Maximum number of attempts.
        wait_time (int): Seconds to wait between attempts.
    Returns:
        The successful response, or None if every attempt failed.
    """
    for attempt_number in range(1, max_retries + 1):
        try:
            places_response = searcher.ordnance_survey_client.get_places_api()
            status_code = places_response.get("status")
        except Exception as e:
            print(f"Attempt {attempt_number} failed with error: {e}")
        else:
            if status_code == 200:
                # Success - hand the response straight back
                return places_response
            print(f"Attempt {attempt_number} failed with status code: {status_code}")

        # Only pause if there is another attempt to come
        if attempt_number < max_retries:
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    print(f"All {max_retries} attempts failed.")
    return None
def app():
    """
    This script handles the preparation of the data from Stonewater, to archetype a collection
    of 5.3k properties and reduce that down to a representative set of 450 properties.
    Here, we prepare the input data for clustering

    Steps performed, in order:
      1. Read the Stonewater asset list from S3 and build a `full_address` per asset.
      2. Read the previously extracted EPC data from S3 and score how well each EPC address
         matches the asset list address (house number match + fuzzy similarity).
      3. For assets that are not confidently matched, read the stored Ordnance Survey (OS)
         results from S3 and score those against the asset list address.
      4. For OS matches that still look problematic, query OS again (live API calls, throttled
         with a 2 second sleep per record) and compare again.
      5. For records whose postcode still doesn't match, look through all returned OS results for
         one with the same postcode; export the asset rows listed in the hand-maintained
         `no_matches` module to a local Excel file.

    Side effects: reads from the "retrofit-data-dev" bucket, calls the OS API, prints errors and
    writes "Stonewater - addresses with no matches.xlsx" to the working directory.

    :return: Nothing is returned.
    """
    # TODO: Temp read from local machine - move to s3
    # asset_list = pd.read_excel(
    #     "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
    # )
    asset_list = read_excel_from_s3(
        file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        bucket_name="retrofit-data-dev",
        header_row=4
    )
    # Drop the bottom 4 rows, which are completely missing
    asset_list = asset_list.head(-4)
    # Keep just the columns we're interested in, renamed to our internal snake_case names
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
        }
    )
    # Create full address: "address1, [address2, ]City/Town, postcode".
    # address2 is only included when it is present; county is deliberately left out.
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )
    # Any null component (address1, city_town, postcode) makes the concatenation null,
    # so this guards against incomplete asset rows
    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")
    # Pull in the data
    # This data has already been pulled as much as it can be, so we retrieve the existing extraction from S3
    # Perform an initial pull without ordnance survey data
    # (the extraction code is kept below, commented out, as a record of how the S3 files were produced)
    # epc_data = []
    # older_epc_data = {}
    #
    # for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key="",
    #         full_address=str(asset["full_address"]),
    #         uprn=asset.get("uprn", None),
    #     )
    #     searcher.find_property(skip_os=True)
    #
    #     if searcher.newest_epc is None:
    #         continue
    #
    #     epc_data.append(
    #         {
    #             "internal_id": asset["internal_id"],
    #             **searcher.newest_epc
    #         }
    #     )
    #
    #     if searcher.older_epcs is not None:
    #         older_epc_data[asset["internal_id"]] = searcher.older_epcs
    #
    # # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(epc_data),
    #     s3_file_name="customers/Stonewater/clustering/epc_data.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(older_epc_data),
    #     s3_file_name="customers/Stonewater/clustering/old_epc_data.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # We read this directly from s3
    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )
    # NOTE(review): older_epc_data is loaded but not used again in this function
    older_epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
        )
    )
    # Perform a comparison between the EPC address and the asset list address, just to double check
    epc_data_df = pd.DataFrame(epc_data)
    # Inner merge: only assets for which an EPC was found appear in the comparison
    address_comparison = (
        asset_list[["internal_id", "full_address", "postcode", "house_number", "address1"]].merge(
            epc_data_df[["internal_id", "address", "postcode", "address1"]].rename(
                columns={
                    "address": "epc_address",
                    "postcode": "epc_postcode",
                    "address1": "epc_address1"
                }
            ),
            how="inner",
            on="internal_id"
        )
    )
    # Produce a metric, showing the matching confidence between the two
    address_comparison["epc_extracted_house_number"] = address_comparison["epc_address1"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )
    # Case-insensitive comparison; the .str accessor presumably requires house_number to be
    # held as strings - TODO confirm against the spreadsheet
    address_comparison["house_numbers_match"] = (
        address_comparison["house_number"].str.lower() == address_comparison["epc_extracted_house_number"].str.lower()
    )
    # We also produce a address similarity metric
    # We convert the strings to lower and remove common punctuation
    address_comparison["address_similarity_score"] = address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["address1"].lower()),
            remove_commas_and_full_stops(x["epc_address1"].lower())
        ),
        axis=1
    )
    address_comparison = address_comparison.sort_values("address_similarity_score", ascending=True)
    address_comparison = address_comparison[
        ["internal_id", "full_address", "epc_address", "address_similarity_score", "house_numbers_match"]
    ]
    # Anything with a similarity score of 90 or less, or a house number mismatch, let's do again
    needs_ordnance_survey = address_comparison[
        (address_comparison["address_similarity_score"] <= 90) |
        (~address_comparison["house_numbers_match"])
    ].copy()
    is_ok = address_comparison[~address_comparison["internal_id"].isin(needs_ordnance_survey["internal_id"])]
    is_ok = is_ok.sort_values("address_similarity_score", ascending=True)
    # Everything that is not confidently matched: this covers both the low-confidence EPC matches
    # and the assets that had no EPC at all (they never entered address_comparison)
    os_data_pull_asset_list = asset_list[
        ~asset_list["internal_id"].isin(is_ok["internal_id"].values)
    ].copy()
    # We have already done a partial pull of the Ordnance survey data so we can skip some of the records
    # os_most_relevant_1 = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/os_most_relevant_1.json"
    #     )
    # )
    #
    # os_most_relevant_2 = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/os_most_relevant_2.json"
    #     )
    # )
    #
    # fetched_internal_ids = (
    #     [x["internal_id"] for x in os_most_relevant_1] + [x["internal_id"] for x in os_most_relevant_2]
    # )
    #
    # # We remove any ids we've already fetched
    # os_data_pull_asset_list = os_data_pull_asset_list[
    #     ~os_data_pull_asset_list["internal_id"].isin(fetched_internal_ids)
    # ]
    #
    # # Our OK EPC data (is_ok) + ordnance survey fetched data + the data we need to fetch should equal the total
    # # number of assets
    # assert len(is_ok) + len(fetched_internal_ids) + len(os_data_pull_asset_list) == len(asset_list)
    os_data_pull_asset_list = os_data_pull_asset_list.reset_index(drop=True)
    # For each of these records, we pull the OS data
    # (kept commented out: the results of this pull are read from S3 further down)
    # ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    # os_most_relevant = []
    # os_all = {}
    # errors = []
    # for _, asset in tqdm(os_data_pull_asset_list.iterrows(), total=len(os_data_pull_asset_list)):
    #     # Calls are throttled to 50 per minute in development mode, so lets just slow this down
    #     time.sleep(2)
    #
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key=ORDNANCE_SURVEY_API_KEY,
    #         full_address=str(asset["full_address"]),
    #         uprn=asset.get("uprn", None),
    #     )
    #     searcher.ordnance_survey_client.full_address = asset["full_address"]
    #     # Attempt to get places data with retry logic
    #     result = get_places_with_retry(searcher)
    #
    #     if result:
    #         # Get the most relevant response
    #         os_most_relevant.append(
    #             {
    #                 "internal_id": asset["internal_id"],
    #                 **searcher.ordnance_survey_client.most_relevant_result
    #             }
    #         )
    #
    #         # Also keep the best 100 results
    #         os_all[asset["internal_id"]] = searcher.ordnance_survey_client.results
    #     else:
    #         # Record the internal_id of the asset that failed
    #         print("Error for address: " + asset["full_address"])
    #         errors.append(asset["internal_id"])
    # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(os_most_relevant),
    #     s3_file_name="customers/Stonewater/clustering/os_most_relevant_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(os_all),
    #     s3_file_name="customers/Stonewater/clustering/os_all_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(errors),
    #     s3_file_name="customers/Stonewater/clustering/errors_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # We now collate all of the data for the following steps:
    # 1) Checking the retrieved ordnance survey data against the asset list data
    # 2) A second round of querying the EPC api to find the EPC data, in case we retrieve something using uprn
    # 3) Predicting the EPC data for the properties we have no data for
    # 4) Retrieving additional data against the internal_id
    # 5) Creation of final dataset for clustering
    os_most_relevant = []
    os_all = {}
    # The OS pull was stored in three segments; stitch them back together
    for i in ["1", "2", "3"]:
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        # Later segments win if the same key appears in more than one segment
        os_all = {**os_all, **json.loads(os_all_segment)}
    os_most_relevant = pd.DataFrame(os_most_relevant)
    os_address_comparison = os_data_pull_asset_list[
        ["internal_id", "full_address", "postcode", "house_number", "address1"]
    ].merge(
        os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )
    # Check for records where the postcode doesn't match
    os_address_comparison["postcodes_match"] = (
        os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
    )
    # Extract the house number from the OS ADDRESS
    os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )
    # Compare house number
    os_address_comparison["house_numbers_match"] = (
        os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
    )
    # String similarity (here on the full address, not just address1 as for the EPC comparison)
    os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )
    os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)
    # A record is problematic when the similarity is 80 or less, or the house number or postcode differs
    problematic = os_address_comparison.copy()
    problematic = problematic[
        (problematic["address_similarity_score"] <= 80) |
        (~problematic["house_numbers_match"]) |
        (~problematic["postcodes_match"])
    ]
    # TODO: We'll label these problematic records as problematic, in the final output
    # different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)
    # NOTE(review): the key is an empty string in the committed code, so the live OS calls below
    # presumably fail unless a real key is pasted in locally - confirm before running
    ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    problematic_os = []
    problematic_os_all = {}
    problematic_errors = []
    # NOTE(review): this loop makes live API calls every run; the code that would persist the
    # results to S3 is commented out below
    for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
        # Let's just do a backup pull - we're now using LPI too
        # Sleep to stay under the OS API rate limit
        time.sleep(2)
        backup_searher = SearchEpc(
            address1=row["address1"],
            postcode=row["postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=ORDNANCE_SURVEY_API_KEY,
            uprn=None,
        )
        # Attempt to get places data with retry logic
        result = get_places_with_retry(backup_searher)
        if result:
            # Get the most relevant response
            problematic_os.append(
                {
                    "internal_id": row["internal_id"],
                    **backup_searher.ordnance_survey_client.most_relevant_result
                }
            )
            # Also keep the best 100 results
            problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
        else:
            # Record the internal_id of the asset that failed
            print("Error for address: " + row["full_address"])
            problematic_errors.append(row["internal_id"])
    # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(problematic_os),
    #     s3_file_name="customers/Stonewater/clustering/problematic_os.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(problematic_os_all),
    #     s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(problematic_errors),
    #     s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # Next steps: We should collate all of the data and produce 1 big dataset
    problematic_os_df = pd.DataFrame(problematic_os)
    problematic_address_comparison = problematic[["internal_id", "full_address", "postcode", "house_number"]].merge(
        problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )
    # Take the postcode to be the last ", "-separated component of the OS ADDRESS
    problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
    problematic_address_comparison["postcodes_match"] = (
        problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
            "OS_POSTCODE"].str.lower()
    )
    problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )
    problematic_address_comparison = problematic_address_comparison.sort_values(
        "match_similarity_score", ascending=True
    )
    # let's do a house number extraction
    # (this call also passes the postcode to get_house_number, unlike the earlier one-argument calls)
    problematic_address_comparison["extracted_house_number"] = problematic_address_comparison.apply(
        lambda x: SearchEpc.get_house_number(x["ADDRESS"], x["OS_POSTCODE"]), axis=1
    )
    # Compare only the leading token of the asset house number (text before the first comma,
    # then before the first space) with the extracted one
    problematic_address_comparison["house_numbers_different"] = (
        problematic_address_comparison["house_number"].str.lower().str.split(",").str[0].str.split(" ").str[0] !=
        problematic_address_comparison[
            "extracted_house_number"].str.lower()
    )
    # We perform a final check
    # NOTE(review): the intent noted originally was "anything where the postcodes don't match, where the
    # house numbers are different and the match similarity is less than 90, or the match similarity is
    # less than 80" - but the filter below only keeps postcode mismatches. Confirm which is wanted.
    final_check = problematic_address_comparison[
        (~problematic_address_comparison["postcodes_match"])
    ]
    final_check = final_check.sort_values("match_similarity_score", ascending=False)
    final_check = final_check.reset_index(drop=True)
    final_best_matches = []
    no_matches = []
    for _, row in final_check.iterrows():
        # All OS results returned for this property in the backup pull above
        os_data = problematic_os_all[row["internal_id"]]
        # Each result carries its payload under either "DPA" or "LPI"; DPA is preferred when present
        os_data = pd.DataFrame(
            [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
        )
        # Build a single postcode column: POSTCODE where available, otherwise POSTCODE_LOCATOR
        if ("POSTCODE_LOCATOR" in os_data.columns) and ("POSTCODE" in os_data.columns):
            os_data["postcode"] = np.where(
                ~pd.isnull(os_data["POSTCODE"]),
                os_data["POSTCODE"],
                os_data["POSTCODE_LOCATOR"]
            )
        elif "POSTCODE" in os_data.columns:
            os_data["postcode"] = os_data["POSTCODE"]
        else:
            os_data["postcode"] = os_data["POSTCODE_LOCATOR"]
        # Keep only the results in the same postcode as the asset
        os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
        if os_data.shape[0] >= 1:
            # Take the first remaining result as the best same-postcode match
            final_best_matches.append(
                {
                    "internal_id": row["internal_id"],
                    **os_data.iloc[0].to_dict()
                }
            )
        else:
            no_matches.append(
                {
                    "internal_id": row["internal_id"],
                    "full_address": row["full_address"],
                    "postcode": row["postcode"]
                }
            )
    no_matches = pd.DataFrame(no_matches)
    # Data to be confirmed
    # NOTE(review): this import rebinds `no_matches`, so the DataFrame computed just above is discarded
    # and the hand-maintained records (which carry a "Note" field) are exported instead - confirm intended
    from etl.customers.stonewater.no_matches import no_matches
    no_matches_to_export = pd.DataFrame(no_matches)
    # Attach the notes to the asset rows and restore the customer's original column names for the ID columns
    no_matches_to_export = asset_list.merge(
        no_matches_to_export[["internal_id", "Note"]],
        how="inner",
        on="internal_id"
    ).rename(
        columns={
            "internal_id": "Osm. ID",
            "customer_asset_id": "Org. ref.",
            "external_address_id": "Address ID",
        }
    )
    # Written to the current working directory
    no_matches_to_export.to_excel("Stonewater - addresses with no matches.xlsx", index=False)
    # We also confirm final_best_matches
    final_best_matches_df = pd.DataFrame(final_best_matches)[
        ["internal_id", "ADDRESS", "UPRN"]
    ].rename(
        columns={
            "ADDRESS": "Ordnance Survey Address - same postcode (best match)",
            "UPRN": "UPRN - same postcode (best match)"
        }
    )
    # We also get their original match (from the first OS pull)
    final_best_matches_df = final_best_matches_df.merge(
        problematic[["internal_id", "ADDRESS", "UPRN"]].rename(
            columns={
                "ADDRESS": "Ordnance Survey Address - best possible match",
                "UPRN": "UPRN - best possible match"
            }
        ),
        how="inner",
        on="internal_id"
    )
    # merge on the original data
    # NOTE(review): final_best_matches_df is built but not exported or returned in this function
    final_best_matches_df = asset_list.merge(
        final_best_matches_df,
        how="inner",
        on="internal_id"
    ).rename(
        columns={
            "internal_id": "Osm. ID",
            "customer_asset_id": "Org. ref.",
            "external_address_id": "Address ID",
        }
    )
    # Reference: the full original -> internal column name mapping
    # "Osm. ID": "internal_id",
    # "Org. ref.": "customer_asset_id",
    # "Postcode": "postcode",
    # "House no": "house_number",
    # "Name": "address1",
    # "Address line 2": "address2",
    # "City/Town": "city_town",
    # "County": "county",
    # "Address ID": "external_address_id",
def _os_results_to_frame(os_results):
    """Flatten raw OS results into a DataFrame, using each result's "DPA" payload, else its "LPI" one."""
    return pd.DataFrame([z["DPA"] if "DPA" in z else z["LPI"] for z in os_results])


def _rows_matching_udprn(frame, udprn):
    """Return the rows of `frame` whose UDPRN equals `udprn` (compared as strings); empty if no UDPRN column."""
    if "UDPRN" not in frame.columns:
        # e.g. every result was an LPI record, which carries no UDPRN
        return frame.iloc[0:0]
    return frame[frame["UDPRN"].astype(str) == udprn]


def filter_os_data(p_os_data, p_os_data_all, udprn, is_flat=False):
    """
    Pick the Ordnance Survey record for a property, preferring an exact UDPRN match.

    Args:
        p_os_data (pd.DataFrame): The "most relevant" OS record(s) for the property.
        p_os_data_all (list[dict]): Every raw OS result for the property; each item holds its
            payload under a "DPA" or an "LPI" key.
        udprn: The UDPRN to look for. When None, no UDPRN matching is done and the first
            raw result is returned instead.
        is_flat (bool): Only used when `udprn` is None: restrict the raw results to rows whose
            CLASSIFICATION_CODE is "RD06" before taking the first one. Defaults to False so
            that callers that only pass three arguments keep working.
    Returns:
        pd.DataFrame: The matching row(s); empty when nothing matches.
    """
    if udprn is None:
        candidates = _os_results_to_frame(p_os_data_all)
        if is_flat:
            candidates = candidates[candidates["CLASSIFICATION_CODE"] == "RD06"]
        return candidates.head(1)

    # Compare as strings on both lookups so the result does not depend on how UDPRN was parsed
    udprn = str(udprn)
    final_os_data = _rows_matching_udprn(p_os_data, udprn)
    if final_os_data.empty:
        # The most relevant record isn't the right one - search the full result set
        final_os_data = _rows_matching_udprn(_os_results_to_frame(p_os_data_all), udprn)
    return final_os_data
def compile_data():
    """
    Various data sources have been produced to create the final data source for Stonewater.
    This function combines them

    In order, it: reads the asset list and UDPRN spreadsheet (from local paths), the stored
    UPRN/UDPRN lookups, EPC data and both Ordnance Survey (OS) pulls from S3; prepares the asset
    list; builds address -> UPRN lookups; fills in missing EPC UPRNs; queries EPC data for the
    remaining properties; and gathers spatial data per UPRN.

    NOTE(review): this reads as work-in-progress, exploratory code. Several column names used
    below ("UDPRN", "UPRN") do not appear to exist on the frames at the point they are read -
    see the inline notes. Nothing is returned and the S3 saves are commented out.

    :return: Nothing is returned.
    """
    ########################################################################
    # Read in data
    ########################################################################
    # asset_list = read_excel_from_s3(
    #     file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
    #     bucket_name="retrofit-data-dev",
    #     header_row=4
    # )
    #
    # udprn_data = read_excel_from_s3(
    #     file_key="customers/Stonewater/UDPRN updated RA Sample for 5 year programme.xlsx",
    #     bucket_name="retrofit-data-dev",
    #     header_row=0
    # )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
    # NOTE(review): both spreadsheets are read from absolute paths on a developer machine
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        header=4
    )
    udprn_data = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
    )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
    # Nullable-int first so whole numbers don't pick up a ".0", then to string
    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
    # NOTE(review): the merge key is cast to str here only; presumably asset_list["Address ID"]
    # must be string-typed as well for the merge to line up - TODO confirm
    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
    # Inner merge: assets without a row in the UDPRN sheet are dropped
    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
    asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
    # Read in the lookups
    # NOTE(review): the "scustomers/" prefix looks like a typo, but it matches the (commented-out)
    # save calls further down, so the files presumably do live under that key - confirm
    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
    )))
    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
    )))
    # Align the second lookup's column names with the first before stacking them
    uprn_lookup_2 = uprn_lookup_2.rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )
    # concat
    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
    # TODO: Read in UPRNs or UDPRN
    # UPRN LOOKUPS TO READ IN: address_uprn_udprn_lookup, address_uprn_udprn_lookup_2
    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )
    epc_data = pd.DataFrame(epc_data)
    # We drop some EPCs (the ids listed in the module-level internal_id_epcs_to_drop)
    epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]
    # This we can use to produce additional variables such as number of old surveys
    # older_epc_data = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
    #     )
    # )
    # older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
    # This is the first ordnance survey data pull (stored in three segments)
    os_most_relevant_1 = []
    os_all_1 = {}
    for i in tqdm(["1", "2", "3"]):
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant_1.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        # Later segments win if the same key appears in more than one segment
        os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
    # This is the second ordnance survey data pull
    os_most_relevant_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os.json"
    )
    os_most_relevant_2 = json.loads(os_most_relevant_2)
    os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
    os_all_2 = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
    )
    os_all_2 = json.loads(os_all_2)
    ########################################################################
    # Prepare asset list
    ########################################################################
    # TODO: Merge on UPRNs
    # Keep just the columns we're interested in
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
            "udprn"
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
        }
    )
    # Create full address: "address1, [address2, ]City/Town, postcode"
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )
    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")
    # Merge on the lookup (uprn + standardised address); its own udprn column is dropped because
    # asset_list already carries one
    asset_list = asset_list.merge(
        uprn_lookup.drop(columns=["udprn"]), how="left", on=["internal_id", "external_address_id"]
    )
    # This is everything without an EPC
    # Quick check to see if we have os data for every property that doesn't have an EPC
    without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
    os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
    os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()
    missing_os_data = []
    for _, x in without_epc.iterrows():
        # We would prioritise the data pulled the second time around
        internal_id = x["internal_id"]
        if internal_id in os_most_relevant_2_internal_ids:
            continue
        if internal_id in os_most_relevant_1_internal_ids:
            continue
        # Found in neither OS pull
        missing_os_data.append(internal_id)
    if len(missing_os_data):
        raise Exception("We don't have SOME data for each internal_id")
    # Let's create a lookup table of internal_id, external_address_id, udprn, uprn, standardised_address
    address_uprn_udprn_lookup = []
    for _, x in without_epc.iterrows():
        # NOTE(review): the column was renamed to "udprn" earlier in this function, so reading
        # "UDPRN" here (and below) looks like it would raise a KeyError - confirm
        if pd.isnull(x["UDPRN"]):
            continue
        udprn = str(int(x["UDPRN"]))
        internal_id = x["internal_id"]
        is_flat = "flat" in x["address1"].lower()
        # Get the OS data: try the first pull, then fall back to the second if that gave nothing
        final_os_data = pd.DataFrame()
        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
            # Keys are strings because the dict was loaded from JSON
            p_os_data_all = os_all_1[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
        if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
            p_os_data_all = os_all_2[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
        if final_os_data.empty:
            continue
        # Several rows are tolerated only if they all agree on the UPRN
        if final_os_data.shape[0] != 1:
            if final_os_data["UPRN"].nunique() > 1:
                raise Exception("Investigate me")
        address_uprn_udprn_lookup.append(
            {
                "internal_id": internal_id,
                "external_address_id": x["external_address_id"],
                "udprn": udprn,
                "uprn": final_os_data["UPRN"].values[0],
                "standardised_address": final_os_data["ADDRESS"].values[0],
                "standardised_postcode": final_os_data["POSTCODE"].values[0]
            }
        )
    # Store this lookup
    # save_data_to_s3(
    #     data=json.dumps(address_uprn_udprn_lookup),
    #     s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json",
    #     bucket_name="retrofit-data-dev"
    # )
    address_uprn_udprn_lookup = pd.DataFrame(address_uprn_udprn_lookup)
    # NOTE(review): `missed` is computed for inspection only; it is not used again
    missed = asset_list[~asset_list["internal_id"].isin(address_uprn_udprn_lookup["internal_id"].values)]
    # Compare asset list addresses with EPC addresses for the properties that do have an EPC
    # NOTE(review): "UDPRN" is selected here although the column appears to be named "udprn" - confirm
    address_comparison = (
        asset_list[
            ["internal_id", "external_address_id", "UDPRN", "full_address", "postcode", "house_number", "address1"]
        ].merge(
            epc_data[["internal_id", "address", "postcode", "address1", "uprn"]].rename(
                columns={
                    "address": "epc_address",
                    "postcode": "epc_postcode",
                    "address1": "epc_address1"
                }
            ),
            how="inner",
            on="internal_id"
        )
    )
    # Similarity of "address1 + postcode" on both sides, lower-cased with commas/full stops removed
    address_comparison["address_similarity_score"] = address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["address1"].lower() + x["postcode"].lower()),
            remove_commas_and_full_stops(x["epc_address1"].lower() + x["epc_postcode"].lower())
        ),
        axis=1
    )
    address_comparison = address_comparison.sort_values("address_similarity_score", ascending=False)
    # A score of 95 or more is treated as a confident match
    confident = address_comparison[address_comparison["address_similarity_score"] >= 95]
    low_confidence = address_comparison[address_comparison["address_similarity_score"] < 95].copy()
    lookup_2 = confident[
        [
            'internal_id', 'external_address_id', 'UDPRN', 'uprn',
            'epc_address', 'epc_postcode']
    ].rename(columns={"UDPRN": "udprn"})
    # Store in S3
    # save_data_to_s3(
    #     data=json.dumps(lookup_2.to_dict("records")),
    #     s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # Need to deal with the low confidence records
    # NOTE(review): this loop computes final_os_data for each record but never stores or uses it,
    # so it currently has no effect on the output
    low_confidence_asset_list = asset_list[asset_list["internal_id"].isin(low_confidence["internal_id"])]
    for _, x in low_confidence_asset_list.iterrows():
        udprn = str(int(x["UDPRN"]))
        internal_id = x["internal_id"]
        # Get the OS data
        final_os_data = pd.DataFrame()
        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
            p_os_data_all = os_all_1[str(internal_id)]
            # NOTE(review): is_flat is not passed here, unlike the calls in the loop above -
            # confirm the intended value
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
        if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
            p_os_data_all = os_all_2[str(internal_id)]
            final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
    # For the EPC data, some of them are missing UPRN:
    # where the EPC uprn is an empty string, use the manually mapped one instead
    epc_data = epc_data.merge(missing_uprn_map, how="left", on="internal_id")
    epc_data["uprn"] = np.where(
        epc_data["uprn"] == "",
        epc_data["mapped_uprn"],
        epc_data["uprn"]
    )
    epc_data = epc_data.drop(columns=["mapped_uprn"])
    # Once we have UPRNs, we might want to pull in the EPC data again
    # epc_data_with_uprn = []
    # older_epc_data_with_uprn = {}
    #
    # for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key="",
    #         full_address=str(asset["full_address"]),
    #         uprn=asset["uprn"]
    #     )
    #     searcher.find_property(skip_os=True)
    #
    #     if searcher.newest_epc is None:
    #         continue
    #
    #     epc_data_with_uprn.append(
    #         {
    #             "internal_id": asset["internal_id"],
    #             **searcher.newest_epc
    #         }
    #     )
    #
    #     if searcher.older_epcs is not None:
    #         older_epc_data_with_uprn[asset["internal_id"]] = searcher.older_epcs
    # We now get the remaining properties (those with no EPC record)
    # TODO: We might want to use epc_data_with_uprn
    remaining_properties = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
    # We estimate the data
    final_epcs = []
    for _, p in remaining_properties.iterrows():
        internal_id = p["internal_id"]
        # NOTE(review): "UPRN" (upper case) is read here while "uprn" is read a few lines below;
        # the merged lookup column appears to be the lower-case one - confirm
        uprn = p["UPRN"]
        # Unlike the earlier check, the first OS pull takes priority here
        if internal_id in os_most_relevant_1_internal_ids:
            p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id].to_dict("records")[0]
            p_os_full = os_all_1[str(internal_id)]
        else:
            p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id].to_dict("records")[0]
            p_os_full = os_all_2[str(internal_id)]
        p_os_full = pd.DataFrame(
            [x["DPA"] if "DPA" in x else x["LPI"] for x in p_os_full]
        )
        # TODO: Add this back in
        # When we have this
        if p["uprn"] != p_os_data["UPRN"]:
            # The most relevant record isn't the one for our uprn: take it from the full result set
            # NOTE(review): [0] raises IndexError when no result has this uprn (e.g. uprn is missing)
            filtered = p_os_full[p_os_full["UPRN"] == p["uprn"]]
            p_os_data = filtered.to_dict("records")[0]
        searcher = SearchEpc(
            address1=str(p["address1"]),
            postcode=str(p["postcode"]),
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            uprn=uprn
        )
        # Feed the OS classification code to the searcher before looking the property up
        searcher.ordnance_survey_client.parse_classification_code(p_os_data["CLASSIFICATION_CODE"])
        searcher.find_property(skip_os=True)
        # NOTE(review): newest_epc can presumably be None (other code in this file guards for it);
        # unpacking None here would raise a TypeError - confirm
        final_epcs.append(
            {
                "internal_id": internal_id,
                **searcher.newest_epc
            }
        )
    final_epcs = pd.DataFrame(final_epcs)
    # Existing EPCs plus the ones just retrieved for the remaining properties
    complete_epcs = pd.concat(
        [
            epc_data,
            final_epcs
        ]
    )
    # We now pull spatial data
    # We get the spatial file list and loop through each EPC and determine which file it needs.
    # We then just read in the files that we need and get the data, for each uprn from that file
    uprn_filenames = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
    )
    # From here on uprn_lookup is a dict of spatial filename -> list of UPRNs found in that file
    # (the name was previously bound to the lookup DataFrame)
    uprn_lookup = {}
    for uprn in complete_epcs["uprn"]:
        if pd.isnull(uprn):
            # TODO: Do something about this!
            continue
        # Each metadata row covers the UPRN range [lower, upper]
        filtered_df = uprn_filenames[
            (uprn_filenames["lower"] <= int(uprn))
            & (uprn_filenames["upper"] >= int(uprn))
        ]
        # NOTE(review): values[0] raises IndexError if no file range covers this uprn
        if filtered_df["filenames"].values[0] in uprn_lookup:
            uprn_lookup[filtered_df["filenames"].values[0]].append(int(uprn))
        else:
            uprn_lookup[filtered_df["filenames"].values[0]] = [int(uprn)]
    spatial_data_to_uprn = []
    for filename, associated_uprn in tqdm(uprn_lookup.items(), total=len(uprn_lookup)):
        # Read in the file
        spatial_data = read_dataframe_from_s3_parquet(
            bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
        )
        # Keep only the rows for the UPRNs we need from this file
        spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
        spatial_data_to_uprn.append(spatial_df)
    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
    # TODO: Let's store this in s3
    # save_data_to_s3(
    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # We merge this spatial data onto final EPCS
    # NOTE(review): that merge is not implemented yet; the function ends here without returning anything
def concatenate_row(row):
    """
    Join the non-empty values of a row into a single comma separated string.

    Null values (None / NaN) and empty strings are skipped; every remaining value is converted with ``str``.
    The filtering is done explicitly rather than with ``Series.replace('', None)`` because the meaning of
    ``value=None`` in ``replace`` differs between pandas versions (older releases forward-fill instead of
    inserting None, which would duplicate the previous address part).

    :param row: pandas Series (or any iterable of scalars) holding the address parts
    :return: the parts joined with ", "; an empty string when nothing is left
    """
    parts = [str(value) for value in row if not pd.isnull(value) and value != '']
    return ', '.join(parts)
def adjust_clusters(cluster_allocation, total_clusters):
    """
    Nudge a per-group cluster allocation so that it sums to ``total_clusters``.

    Clusters are added to (or removed from) the groups one at a time, starting from the group with the largest
    allocation. If the difference is larger than the number of groups, further passes are made until the target
    is reached. When removing clusters a group is never taken below 1; if every group is already at 1 the
    allocation is returned as-is even though it still exceeds ``total_clusters``.

    :param cluster_allocation: dict of group -> number of clusters. It is modified in place.
    :param total_clusters: the total number of clusters the allocation should sum to
    :return: the (same) adjusted ``cluster_allocation`` dict
    """
    adjustment = total_clusters - sum(cluster_allocation.values())
    step = 1 if adjustment > 0 else -1
    remaining = abs(adjustment)

    while remaining > 0:
        changed = False
        # Largest groups first; the order is re-evaluated on every pass
        for group in sorted(cluster_allocation, key=lambda g: -cluster_allocation[g]):
            if step < 0 and cluster_allocation[group] <= 1:
                # Every group must keep at least one cluster
                continue
            cluster_allocation[group] += step
            remaining -= 1
            changed = True
            if remaining <= 0:
                break
        if not changed:
            # Nothing left that can be adjusted (empty allocation, or all groups already at 1)
            break

    return cluster_allocation
def compile_data_final():
    """
    Combine the data sources produced for Stonewater into the final clustering dataset, cluster the
    properties into archetypes and write the results to disk.

    The function:
      1. Reads the asset list and UDPRN sheet from local Excel files, and the UPRN lookups from S3 and a local CSV
      2. Merges the UPRN lookups onto the asset list
      3. Pulls (or estimates) an EPC via SearchEpc for every property that does not have one yet
      4. Merges on the spatial data and cleans / imputes the EPC attributes
      5. Runs KMeans within each group of the grouping columns and ranks each property by its distance to its
         cluster centroid (rank 1 is the archetype representative)
      6. Writes "Stonewater asset list with archetypes V2.csv" and "Stonewater uprn lookup table.xlsx"

    :return: None
    :raises ValueError: if a full address cannot be built for a property
    :raises Exception: if an imputation value cannot be computed or a lookup merge changes the number of rows
    """

    def _os_records_for_uprn(os_records, uprn):
        """Return the OS DPA/LPI records whose UPRN matches ``uprn`` as a dataframe (empty when none match)."""
        records_df = pd.DataFrame([x["DPA"] if "DPA" in x else x["LPI"] for x in os_records])
        if records_df.empty or "UPRN" not in records_df.columns:
            return pd.DataFrame()
        # Compare as strings on both sides so that int / str UPRNs are matched consistently
        return records_df[records_df["UPRN"].astype(str) == str(uprn)]

    def _as_numeric(series):
        """Convert a raw EPC column to floats, treating empty strings and None as missing (NaN)."""
        return pd.to_numeric(series.mask(series == '')).astype(float)

    ########################################################################
    # Read in data
    ########################################################################
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        header=4
    )
    udprn_data = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
    )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)

    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
    asset_list = asset_list.rename(columns={"UDPRN": "udprn"})

    # Read in the lookups
    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
    )))
    uprn_lookup_1["match_type"] = "Exact"

    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
    )))
    uprn_lookup_2 = uprn_lookup_2.rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )
    uprn_lookup_2["match_type"] = "EPC"
    # Manual UPRN override for internal_id 1091
    uprn_lookup_2["uprn"] = np.where(
        uprn_lookup_2["internal_id"] == 1091,
        83143766,
        uprn_lookup_2["uprn"]
    )

    uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
    )))
    uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
        concatenate_row, axis=1
    )
    uprn_lookup_3 = uprn_lookup_3[
        ["udprn", "uprn", "standardised_address", "postcode"]
    ].rename(columns={"postcode": "standardised_postcode"})
    uprn_lookup_3["match_type"] = "Exact"

    uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
    uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
    uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)

    # Prepare lookup 4: pick the option that was manually chosen for each property
    uprn_lookup_4 = []
    for _, row in uprn_lookup_4_basis.iterrows():
        if row["option"] == 1:
            uprn = row["os_option_1_uprn"]
            standardised_address = row["os_option_1_address"]
            postcode = row["os_option_1_postcode"]
        elif row["option"] == 2:
            uprn = row["os_option_2_uprn"]
            standardised_address = row["os_option_2_address"]
            # Option 2 has no postcode column, so it is taken from the end of the address
            postcode = row["os_option_2_address"].split(", ")[-1]
        else:
            uprn = row["manual_uprn"]
            standardised_address = row["manual_address"]
            postcode = row["manual_postcode"]

        uprn_lookup_4.append(
            {
                "internal_id": row["internal_id"],
                "external_address_id": row["external_address_id"],
                "uprn": uprn,
                "standardised_address": standardised_address,
                "standardised_postcode": postcode,
                "property_type": None,
                "built_form": None
            }
        )
    uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
    uprn_lookup_4["match_type"] = "Fuzzy"

    # concat
    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])

    # Every property in the asset list should be covered by exactly one lookup
    assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list), \
        "UPRN lookups do not cover the asset list exactly"

    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )
    epc_data = pd.DataFrame(epc_data)
    epc_data["uprn"] = np.where(
        epc_data["internal_id"] == 1091,
        83143766,
        epc_data["uprn"]
    )
    # We drop some EPCs: only the ones matched through the EPC lookup are kept
    epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]

    # This we can use to produce additional variables such as number of old surveys
    # older_epc_data = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
    #     )
    # )
    # older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}

    ########################################################################
    # Prepare asset list
    ########################################################################
    # Keep just the columns we're interested in
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
            "udprn",
            "Owning body"
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
            "Owning body": "owner"
        }
    )

    # Create full address (address2 is only included when it is populated)
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )

    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")

    # Final preps of lookups
    uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
    uprn_lookup_3 = uprn_lookup_3.merge(
        asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
    )

    uprn_lookup = pd.concat([
        uprn_lookup,
        uprn_lookup_3,
        uprn_lookup_4
    ])
    uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)

    asset_list = asset_list.merge(
        uprn_lookup.drop(columns=["udprn"]),
        how="inner",
        on=["internal_id", "external_address_id"]
    )
    # Store locally
    # asset_list.to_excel("Stonewater asset list with uprn.xlsx")

    ########################################################################
    # Ordnance survey data and EPCs for the properties that still need one
    ########################################################################
    # This is the first ordnance survey data pull
    os_most_relevant_1 = []
    os_all_1 = {}
    for i in tqdm(["1", "2", "3"]):
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant_1.extend(json.loads(most_relevant_segment))

        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        os_all_1.update(json.loads(os_all_segment))

    os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)

    # This is the second ordnance survey data pull
    os_all_2 = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
        )
    )

    needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
    os_1_ids = os_most_relevant_1["internal_id"].values

    epc_data_batch_2 = []
    older_epcs_batch_2 = {}
    for _, prop in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
        if pd.isnull(prop["uprn"]):
            continue

        searcher = SearchEpc(
            address1=", ".join(prop["standardised_address"].split(", ")[:-1]),
            postcode=prop["standardised_postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            full_address=prop["standardised_address"],
            uprn=prop["uprn"]
        )
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None and prop["match_type"] == "Exact":
            # Estimate! Get the OS data so we know the property type / built form
            os_key = str(prop["internal_id"])
            p_os_df = pd.DataFrame()
            if prop["internal_id"] in os_1_ids:
                p_os_df = _os_records_for_uprn(os_all_1[os_key], prop["uprn"])

            # Fall back to the second OS pull, if it holds this property
            if p_os_df.empty and os_key in os_all_2:
                p_os_df = _os_records_for_uprn(os_all_2[os_key], prop["uprn"])

            if not p_os_df.empty:
                searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
            else:
                searcher.ordnance_survey_client.property_type = ""

            # Now we estimate
            searcher.newest_epc = searcher.estimate_epc(
                property_type=searcher.ordnance_survey_client.property_type,
                built_form=searcher.ordnance_survey_client.built_form,
                lmks_to_drop=None,
                exclude_old=True
            )
        elif searcher.newest_epc is None and prop["match_type"] == "Fuzzy":
            # No OS data for fuzzy matches, so the property type is inferred from the address
            estimated_property_type = "Flat" if "flat" in prop["standardised_address"].lower() else "House"
            searcher.newest_epc = searcher.estimate_epc(
                property_type=estimated_property_type,
                built_form=None,
                lmks_to_drop=None,
                exclude_old=True
            )

        if searcher.newest_epc is None:
            # Nothing was found and nothing could be estimated. Unpacking None below would raise a TypeError,
            # so the property is skipped and ends up without an archetype.
            continue

        epc_data_batch_2.append(
            {
                "internal_id": prop["internal_id"],
                **searcher.newest_epc
            }
        )
        if searcher.older_epcs is not None:
            older_epcs_batch_2[prop["internal_id"]] = searcher.older_epcs

    # Store in S3
    # TODO - read in instead of running
    # save_pickle_to_s3(
    #     data=epc_data_batch_2,
    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_pickle_to_s3(
    #     data=older_epcs_batch_2,
    #     s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
    #     bucket_name="retrofit-data-dev"
    # )

    epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
    complete_epcs = pd.concat([epc_data, epc_data_batch_2])

    ########################################################################
    # Spatial data
    ########################################################################
    # Each spatial file covers a range of UPRNs; work out which files are needed and which UPRNs to take from each
    uprn_filenames = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
    )
    uprn_map = {}
    for uprn in complete_epcs["uprn"]:
        if pd.isnull(uprn):
            # An EPC without a UPRN cannot be matched to spatial data
            continue
        uprn_int = int(uprn)
        filtered_df = uprn_filenames[
            (uprn_filenames["lower"] <= uprn_int)
            & (uprn_filenames["upper"] >= uprn_int)
        ]
        if filtered_df.empty:
            # No spatial file covers this UPRN
            continue
        uprn_map.setdefault(filtered_df["filenames"].values[0], []).append(uprn_int)

    spatial_data_to_uprn = []
    for filename, associated_uprn in tqdm(uprn_map.items(), total=len(uprn_map)):
        # Read in the file
        spatial_data = read_dataframe_from_s3_parquet(
            bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
        )
        spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
        spatial_data_to_uprn.append(spatial_df)

    # TODO: Let's store this in s3
    # save_pickle_to_s3(
    #     data=spatial_data_to_uprn,
    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
    #     bucket_name="retrofit-data-dev"
    # )

    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
    spatial_data_to_uprn = spatial_data_to_uprn.drop(
        columns=["partition", "filename"]
    ).rename(columns={"UPRN": "uprn"})
    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)

    # NOTE(review): the merge key is a string on the spatial side. Any EPC whose uprn is not a string (e.g. the
    # integer override applied to internal_id 1091 above) will not match here - confirm this is intended.
    property_attributes = complete_epcs.merge(
        spatial_data_to_uprn,
        how="inner",
        on="uprn"
    )

    property_attributes = property_attributes.merge(
        asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
    )

    # TODO: Add on data from the asset list such as ownership

    ########################################################################
    # Clean the attributes
    ########################################################################
    # We drop the columns we don't care about for clustering
    property_attributes = property_attributes.drop(
        columns=[
            "address",
            "uprn-source",
            "heating-cost-potential",
            "hot-water-cost-potential",
            "potential-energy-rating",
            "environment-impact-potential",
            "address3",
            "local-authority-label",
            "sheating-energy-eff",
            "county",
            "postcode",
            "constituency",
            "co2-emissions-potential",
            "energy-consumption-potential",
            "local-authority",
            "inspection-date",
            "address1",
            "constituency-label",
            "building-reference-number",
            "floor-energy-eff",
            "address2",
            "posttown",
            "floor-env-eff",
            "sheating-env-eff",
            "lighting-cost-potential",
            "main-heating-controls",
            "transaction-type",
            "uprn",
            "lodgement-date",
            "lmk-key",
            "wind-turbine-count",
            "tenure",
            "potential-energy-efficiency",
            "glazed-area"
        ]
    )

    # Fields to transform: lodgement-datetime
    property_attributes["days_since_last_epc"] = (
        datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
    ).dt.days
    property_attributes = property_attributes.drop(columns=["lodgement-datetime"])

    # Missing values are filled with the average, rounded to the nearest integer
    fill_with_average = [
        "low-energy-fixed-light-count",
        "floor-height",
        "heating-cost-current",
        "fixed-lighting-outlets-count",
        "hot-water-cost-current",
        "number-heated-rooms",
        "co2-emiss-curr-per-floor-area",
        "total-floor-area",
        "environment-impact-current",
        "co2-emissions-current",
        "number-habitable-rooms",
        "energy-consumption-current",
        'lighting-cost-current',
        "low-energy-lighting",
    ]

    fill_with_mode = [
        "multi-glaze-proportion",
        "extension-count",
    ]

    fill_with_zero = [
        "unheated-corridor-length",
        "number-open-fireplaces",
        "photo-supply",
    ]

    # NOTE(review): "mainheatc-env-eff" is filled with "N" while every other *-eff column uses "N/A" - confirm
    fill_with_categorical = {
        "construction-age-band": "unknown",
        "mainheat-energy-eff": "N/A",
        "windows-env-eff": "N/A",
        "lighting-energy-eff": "N/A",
        "energy-tariff": 'NO DATA!',
        "mechanical-ventilation": 'NO DATA!',
        "solar-water-heating-flag": "N",
        "mains-gas-flag": "N",
        "heat-loss-corridor": "unknown",
        "flat-storey-count": "Not a flat",
        "roof-energy-eff": "N/A",
        "hot-water-env-eff": "N/A",
        "mainheatc-energy-eff": "N/A",
        "main-fuel": 'NO DATA!',
        "lighting-env-eff": "N/A",
        "windows-energy-eff": "N/A",
        "roof-env-eff": "N/A",
        "walls-env-eff": "N/A",
        "mainheat-env-eff": "N/A",
        "flat-top-storey": "N",
        "mainheatc-env-eff": "N",
        "floor-level": "NODATA!",
        "hot-water-energy-eff": "N/A",
        "glazed-type": "unknown"
    }

    # Consolidation columns to single value
    consolidation_columns = {
        "glazed-type": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
        "mechanical-ventilation": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
        "solar-water-heating-flag": {"from": [''], "to": "N"},
        "mains-gas-flag": {"from": [''], "to": "N"},
        "heat-loss-corridor": {"from": ['NO DATA!', ''], "to": "N"},
        "flat-top-storey": {"from": [''], "to": "N"},
        "floor-level": {"from": [""], "to": "NODATA!"}
    }

    # Perform the cleaning
    for col in fill_with_average:
        numeric = _as_numeric(property_attributes[col])
        avg_val = numeric.mean()
        if pd.isnull(avg_val):
            raise Exception("something went wrong")
        property_attributes[col] = numeric.fillna(round(avg_val))

    for c in fill_with_zero:
        property_attributes[c] = property_attributes[c].replace('', 0)
        property_attributes[c] = property_attributes[c].fillna(0)
        property_attributes[c] = property_attributes[c].astype(float)

    for col in fill_with_mode:
        numeric = _as_numeric(property_attributes[col])
        # Series.mode() is sorted, so the smallest value is used when several values are equally common
        modes = numeric.mode()
        if modes.empty:
            raise Exception("something went wrong")
        property_attributes[col] = numeric.fillna(modes.iloc[0])

    for c, fill_val in fill_with_categorical.items():
        property_attributes[c] = property_attributes[c].replace('', fill_val)
        property_attributes[c] = property_attributes[c].fillna(fill_val)

    # Finally, consolidate
    for c, consolidate_config in consolidation_columns.items():
        for v in consolidate_config["from"]:
            property_attributes[c] = property_attributes[c].replace(v, consolidate_config["to"])

    property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
    property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
    property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
        property_attributes["days_since_last_epc"].mean()
    )

    # Not used below; kept so the remaining missing values can be inspected when debugging
    missings = pd.isnull(property_attributes).sum()
    missings = missings[missings > 0]

    # Save this
    # save_pickle_to_s3(
    #     data=property_attributes,
    #     bucket_name="retrofit-data-dev",
    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
    # )
    # from utils.s3 import read_pickle_from_s3
    # property_attributes = read_pickle_from_s3(
    #     bucket_name="retrofit-data-dev",
    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
    # )

    ########################################################################
    # Map the EPC descriptions onto their cleaned attributes
    ########################################################################
    import msgpack
    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

    from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
    from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
    from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
    from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
    from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
    from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
    from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
    from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
    from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes

    cleaners = {
        "floor-description": FloorAttributes,
        'hotwater-description': HotWaterAttributes,
        'main-fuel': MainFuelAttributes,
        'mainheat-description': MainHeatAttributes,
        'mainheatcont-description': MainheatControlAttributes,
        'roof-description': RoofAttributes,
        'walls-description': WallAttributes,
        'windows-description': WindowAttributes,
        'lighting-description': LightingAttributes
    }

    for variable_to_clean in cleaned:
        unique_descriptions = property_attributes[variable_to_clean].unique()
        clean_df = pd.DataFrame(cleaned[variable_to_clean])

        # Descriptions we have not seen before are cleaned on the fly and appended to the lookup
        missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
        if missed:
            descriptions_to_append = []
            for description in missed:
                if variable_to_clean == "lighting-description":
                    cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
                else:
                    cln = cleaners[variable_to_clean](description)
                to_append = {
                    "original_description": description,
                    "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
                    **cln.process()
                }
                descriptions_to_append.append(to_append)

            descriptions_to_append = pd.DataFrame(descriptions_to_append)
            clean_df = pd.concat([clean_df, descriptions_to_append])

        # These two column names are shared between variables, so prefix them with the variable name
        clean_df = clean_df.rename(
            columns={
                "thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
                "is_assumed": f"{variable_to_clean}_is_assumed",
            }
        )
        if 'thermal_transmittance_unit' in clean_df.columns:
            clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])

        starting_size = len(property_attributes)
        property_attributes = property_attributes.merge(
            clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
        )
        if starting_size != property_attributes.shape[0]:
            # Duplicate descriptions in the lookup would duplicate properties
            raise Exception("something went wrong")

        property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])

        # Fill missings
        for k in clean_df.columns:
            if k in property_attributes.columns:
                property_attributes[k] = property_attributes[k].fillna("missing")

    # We group some variables such as thermal transmittance for walls, roof, floors
    ranges = {
        "< 0.1": (0, 0.1),
        "0.1 - 0.3": (0.1, 0.3),
        "0.3 - 0.5": (0.3, 0.5),
        "0.5+": (0.5, 2.5),
    }

    # Generate the lookup table: every value from 0.01 to 2.50 in steps of 0.01, mapped to its range label
    thermal_transmittance_lookup_table = []
    for i in range(1, 251):
        value = i / 100
        for label, (low, high) in ranges.items():
            if low < value <= high:
                thermal_transmittance_lookup_table.append({"from": value, "to": label})
                break

    thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
    # The mapping is done on the string representation of the value
    thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)

    thermal_transmittance_cols = [
        c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
    ]

    for i, col in enumerate(thermal_transmittance_cols):
        # Perform the mapping; values that are not in the lookup become "unknown"
        to_col = f"to_{col}"
        property_attributes[col] = property_attributes[col].astype(str)
        property_attributes = property_attributes.merge(
            thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
            how="left",
            left_on=col,
            right_on="from",
            suffixes=("", f"_{i}")
        )
        property_attributes = property_attributes.drop(columns=["from", col])
        property_attributes[to_col] = property_attributes[to_col].fillna("unknown")

    # Drop the description columns that are the keys in cleaned
    print("PUT ME BACK!!??")
    property_attributes = property_attributes.drop(columns=list(cleaned))

    ########################################################################
    # CLUSTERING!!
    ########################################################################
    grouping_columns = [
        'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
    ]
    id_column = 'internal_id'
    n_clusters = 450
    random_state = 0

    # Define the preprocessing for numerical and categorical features.
    # The id column is never a feature: it becomes the index of each group below.
    numerical_features = [
        c for c in property_attributes.select_dtypes(include=['int64', 'float64']).columns if c != id_column
    ]
    object_columns = [
        c for c in property_attributes.select_dtypes(include=['object', 'category']).columns if c != id_column
    ]
    for col in object_columns:
        property_attributes[col] = property_attributes[col].astype(str)
    # The grouping columns are constant within a group, so they are not used as clustering features
    categorical_features = [c for c in object_columns if c not in grouping_columns]

    training_data_grouped = property_attributes.groupby(grouping_columns)
    group_sizes = {name: len(group) for name, group in training_data_grouped}
    total_size = sum(group_sizes.values())

    # Clusters are allocated in proportion to the group size, with at least one cluster per group
    cluster_allocation = {
        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
    }

    # Adjust cluster allocation to ensure total clusters sum to n_clusters
    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)

    # TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
    #  collect the results of the clustering and then perform the transformations afterwards
    final_clusters = []
    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
        # Work on a new frame rather than mutating the groupby slice in place
        group_data = group_data.set_index(id_column)
        # KMeans cannot fit more clusters than there are properties in the group
        group_n_clusters = min(cluster_allocation[group_variables], len(group_data))

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(), categorical_features)
            ]
        )

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])

        # Fit the pipeline to the data
        pipeline.fit(group_data)

        # Transform the data using the fitted pipeline
        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)

        # Get cluster labels
        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_

        # Get centroids (already in the same transformed space)
        centroids = pipeline.named_steps['kmeans'].cluster_centers_

        # if the data isn't an array (i.e. it is a sparse matrix), make it one
        if not isinstance(processed_data, np.ndarray):
            processed_data = processed_data.toarray()

        # Calculate distances from each point to the centroid of its cluster
        distances_to_centroids = [
            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
            for i, label in enumerate(group_data['cluster'])
        ]
        group_data['distance_to_centroid'] = distances_to_centroids

        # Ranking rows by distance within each cluster: rank 1 is the property closest to the centroid
        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')

        group_data = group_data.sort_values(by=['cluster', 'rank']).reset_index()

        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
        # Cluster labels restart at 0 in every group, so the group key is appended to make them unique
        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
        final_clusters.append(to_append)

    final_clusters = pd.concat(final_clusters)
    # remap the clusters from the current names to sequential numbers, starting at 0
    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
    final_clusters["cluster"] = final_clusters["cluster"].astype(str)

    ################################################
    # Prepare outputs!!!!
    ################################################
    property_attributes = property_attributes.reset_index(drop=True)
    property_attributes = property_attributes.merge(
        final_clusters, how="left", on="internal_id"
    )
    property_attributes["archetype_representative"] = property_attributes["rank"] == 1

    asset_list_with_archetypes = asset_list.merge(
        property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
        on="internal_id"
    )

    # Properties that were not clustered are labelled "NO ARCHETYPE"; -999 is a temporary placeholder so the
    # remaining values can be converted to integers first
    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
    asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")

    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")

    asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
        "archetype_representative"].fillna(False)

    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)

    stonewater_uprn_lookup = asset_list_with_archetypes[
        ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
    ]
    stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
def pull_ideal_postcodes(missing_uprn_with_udprn, api_key="", start_from=0):
    """
    Look up each UDPRN with the Ideal Postcodes API and return the address records.

    One request is made per row, with a 0.5 second pause before each one.

    :param missing_uprn_with_udprn: dataframe with a "udprn" column
    :param api_key: Ideal Postcodes API key. Log into the platform to get the API key:
        https://account.ideal-postcodes.co.uk/
    :param start_from: number of rows (by position) to skip; used to resume a pull that failed part way through
    :return: list with the "result" entry of each API response, in row order
    :raises ValueError: if the API responds with anything other than a 200
    """
    import requests
    import time

    uprn_to_udprn = []
    rows = enumerate(missing_uprn_with_udprn.iterrows())
    for position, (_, data) in tqdm(rows, total=len(missing_uprn_with_udprn)):
        # Skip by position rather than by index label, so a filtered / re-ordered dataframe is handled correctly
        if position < start_from:
            continue

        time.sleep(0.5)

        # Call the API
        udprn = data["udprn"]
        response = requests.get(
            f"https://api.ideal-postcodes.co.uk/v1/udprn/{udprn}",
            params={"api_key": api_key},
            headers={'Accept': 'application/json'},
            # Without a timeout a stalled connection would hang the whole pull
            timeout=30
        )

        if response.status_code != 200:
            raise ValueError("API call dead")

        uprn_to_udprn.append(response.json()["result"])

    return uprn_to_udprn
def updated_version():
"""
This version of the clustering factors in the updates received from Stonewater to simplify the archetyping process
using fewer variables and also factoring in their internal data sources
This work began on the 23rd July 2024
:return:
"""
########################################################################
# Read in data
########################################################################
asset_list = read_asset_list()
asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)
# Read in the properties that have been included in Osmosis' wave 2.1
osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
# We also check the address & postcode
asset_list["In Osmosis Wave 2.1"] = np.where(
asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
True,
asset_list["In Osmosis Wave 2.1"]
)
priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
# Pull in the EPC data
epc_data = read_epc_data(uprn_lookup_2)
# Pull in the spatial data to UPRN
spatial_data_to_uprn = read_pickle_from_s3(
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
bucket_name="retrofit-data-dev"
)
# Function to convert specific columns to bool dtype
def convert_specific_columns_to_bool(df, columns):
    """
    Convert the given columns of ``df`` to bool dtype, in place.

    Missing values are treated as False. They have to be filled before the conversion because NaN is truthy,
    so ``astype(bool)`` on its own would turn every missing flag into True.

    :param df: dataframe to convert; it is modified in place
    :param columns: names of the columns to convert; names that are not in ``df`` are ignored
    :return: the same dataframe
    """
    for column in columns:
        if column in df.columns:
            df[column] = df[column].fillna(False).astype(bool)
    return df
spatial_data_to_uprn = [convert_specific_columns_to_bool(
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
) for df in spatial_data_to_uprn]
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
spatial_data_to_uprn = spatial_data_to_uprn.drop(
columns=["partition", "filename"]
).rename(columns={"UPRN": "uprn"})
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
########################################################################
# Prepare the data
########################################################################
# Filter the asset list down to the priority postcodes
asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
master_sheet = master_sheet[
master_sheet["Address ID"].isin(
asset_list["external_address_id"].values
)
]
master_sheet["days_since_lodgement"] = (
datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
).dt.days
asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
how="left",
left_on="external_address_id",
right_on="Address ID"
)
asset_list = asset_list.merge(
epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
how="left",
on="internal_id"
)
asset_list["days_since_lodgement_epc"] = (
datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
).dt.days
# Flag properties that were surveyed within the last 5 years
asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365
# Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
# a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
# the EPC is done
asset_list["is_epc_c_or_above"] = (
((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
(asset_list["EPC Rating"] >= 80)
)
clustering_features = asset_list[
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
~pd.isnull(asset_list["uprn"])
][
[
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
"city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
]
]
# Merge on the SAP data
clustering_features = clustering_features.merge(
master_sheet[
["Address ID", "SAP"]
].rename(columns={"SAP": "parity_modelled_sap"}),
how="left",
left_on="external_address_id",
right_on="Address ID"
)
# For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
clustering_features["representative_sap"] = np.where(
clustering_features["epc_within_5_years"],
clustering_features["current-energy-efficiency"],
clustering_features["parity_modelled_sap"]
)
# We remove the final three entries from postcode to give us postal region. Removing two gives us 415 values which
# is too many
clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
# Merge on spatial features
clustering_features = clustering_features.merge(
spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
how="left",
on="uprn"
)
# incorect_epcs = clustering_features[
# clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
# incorect_epcs = incorect_epcs[
# ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
# ]
# incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
# # Store data
# incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
# We add in the key features, which are used for clustering
master_sheet_clustering_features = master_sheet[
["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
].copy()
# Step 1: Remap walls - we end up with 11 types
master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
{
"TimberFrame: AsBuilt": "Other wall type, as built",
"SystemBuilt: AsBuilt": "Other wall type, as built",
"Sandstone: AsBuilt": "Other wall type, as built",
"Sandstone: Internal": "Other wall type, internal or external",
"SystemBuilt: External": "Other wall type, internal or external",
"GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
"TimberFrame: Internal": "Other wall type, internal or external",
"Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
"SystemBuilt: Internal": "Other wall type, internal or external",
"Cavity: Internal": "Other wall type, internal or external",
}
)
# Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
# gives us the insulation thickness
# Clean an incorrect value
master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
{
"PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
"PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
}
)
master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
master_sheet_clustering_features['Roofs'].apply(
lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
)
)
# Strip any extra whitespace
master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
master_sheet_clustering_features['roof_insulation_thickness'] = (
master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
)
def map_thickness(thickness):
    """
    Bucket a roof insulation thickness string into "Above 250mm" / "Below 250mm".

    "mm", "+" and spaces are removed before parsing, so "150mm+" and " 270 mm" are both understood.
    A thickness of exactly 250 falls in the "Below 250mm" bucket.

    :param thickness: thickness text such as "100mm"; anything else is passed through
    :return: one of the two bucket labels, or the input unchanged when it is not a string or is not numeric
    """
    # Missing values (None / NaN) have no .replace; hand them back rather than raising AttributeError
    if not isinstance(thickness, str):
        return thickness

    cleaned = thickness.replace('mm', '').replace('+', '').replace(' ', '')
    try:
        value = float(cleaned)
    except ValueError:
        # Non-numeric descriptions (including the empty string) are kept as their own category
        return thickness
    return "Above 250mm" if value > 250 else "Below 250mm"
# Bucket the roof insulation thickness; non-numeric descriptions pass through map_thickness unchanged
master_sheet_clustering_features['roof_insulation_category'] = (
    master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
)

# Ideas
# 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
# as a secondary category
# 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
# (could split on :, take first part)

# Left merge: every row already in clustering_features is kept, with or without a master sheet match
clustering_features = clustering_features.merge(
    master_sheet_clustering_features,
    how="left",
    on="Address ID"
)

# Reduce down to the final set of features we need.
# The explicit copy makes this an independent frame, so the column assignments that follow are not chained
# assignments on a selection of the merged frame (which triggers pandas' SettingWithCopyWarning).
clustering_features = clustering_features[
    [
        "internal_id",
        "Property Type",
        # Location
        "postal_region",
        'conservation_status',
        'is_listed_building',
        'is_heritage_building',
        "county",
        # Walls
        "walls_reduced",
        # Roof
        "roof_type",
        "roof_insulation_category",
        # Heating
        "Heating",
        # Fuel
        "Main Fuel",
        "Age",
        "Total Floor Area",
        "representative_sap",
        "days_since_lodgement",
    ]
].copy()

# 99999 is a sentinel for rows with no days_since_lodgement value
clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
def split_property_type(row):
    """
    Break a colon separated "Property Type" string into exactly three stripped components.

    :param row: text such as "House: Detached: Bungalow"
    :return: pd.Series of [property_type, built_form, property_extended_feature]; components that are absent
        come back as empty strings and anything beyond the third component is discarded
    """
    components = [component.strip() for component in row.split(':')[:3]]
    # Pad with blanks so callers always receive three values
    components.extend([''] * (3 - len(components)))
    return pd.Series(components)
# Expand the combined "Property Type" text into its three components, then drop the combined column
property_type_parts = clustering_features['Property Type'].apply(split_property_type)
clustering_features[['property_type', 'built_form', 'property_extended_feature']] = property_type_parts
clustering_features = clustering_features.drop("Property Type", axis=1)

# These are the variables we MUST split by: clustering is run separately within each combination of them
grouping_columns = ["property_type", "walls_reduced", "roof_type", "Main Fuel", "county"]
def combine_small_groups(clustering_features, grouping_columns, threshold=2):
    """
    Fold undersized groups into the most similar adequately sized group.

    A group is one combination of values of grouping_columns. Groups with at most `threshold` rows have their
    grouping column values overwritten with those of the nearest row (Euclidean distance over the one-hot
    encoded categorical columns plus the raw numerical columns) taken from the remaining, larger groups.
    A small group moves as a whole: the member row that lies closest to any large-group row picks the target.

    :param clustering_features: DataFrame with an "internal_id" column, the grouping columns and the features
    :param grouping_columns: list of column names that define a group
    :param threshold: groups with this many rows or fewer are merged away
    :return: the input unchanged when there is nothing to merge (or nothing to merge into); otherwise a new
        DataFrame with the large-group rows first followed by the reassigned small-group rows
    """
    # Identify small groups
    group_sizes = clustering_features.groupby(grouping_columns).size()
    small_groups = group_sizes[group_sizes <= threshold].index.tolist()

    in_small_group = clustering_features.set_index(grouping_columns).index.isin(small_groups)
    # Explicit copy: the grouping columns of these rows are overwritten further down
    small_group_data = clustering_features[in_small_group].copy()
    clustering_features_ok = clustering_features[~in_small_group]

    # Nothing to merge, or every group is small so there is no group to merge into
    if small_group_data.empty or clustering_features_ok.empty:
        return clustering_features

    # internal_id is an identifier, not a property characteristic, so it must not influence the distance
    # whether it is stored as text or as a number
    candidate_features = clustering_features_ok.drop(columns=["internal_id"])
    categorical_features = candidate_features.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = candidate_features.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # The encoder is fitted on the large groups only; categories seen only in small groups encode as all zeros
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ohe.fit(clustering_features_ok[categorical_features])
    small_group_ohe = ohe.transform(small_group_data[categorical_features])
    large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])

    # Concatenate one-hot encoded categorical and numerical features
    # NOTE(review): the numerical columns are used unscaled, so large-valued columns dominate the distance;
    # confirm that this is intended
    small_group_features = np.hstack([small_group_ohe, small_group_data[numerical_features].values])
    large_group_features = np.hstack([large_group_ohe, clustering_features_ok[numerical_features].values])

    # For every small-group row: position of, and distance to, its nearest large-group row
    nearest_positions, nearest_distances = pairwise_distances_argmin_min(small_group_features, large_group_features)

    donor_keys = clustering_features_ok[grouping_columns].to_numpy()
    grouping_column_positions = [small_group_data.columns.get_loc(column) for column in grouping_columns]

    # Row positions of each small group's members, captured before any grouping value is rewritten
    members_by_group = small_group_data.groupby(grouping_columns).indices
    for member_positions in members_by_group.values():
        # The member with the smallest distance decides where the whole group goes
        best_member = member_positions[np.argmin(nearest_distances[member_positions])]
        donor = donor_keys[nearest_positions[best_member]]
        for column_position, value in zip(grouping_column_positions, donor):
            small_group_data.iloc[member_positions, column_position] = value

    return pd.concat([clustering_features_ok, small_group_data])
# Fold groups that are too small to cluster on their own into their closest larger group
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
########################################################################
# Clustering
########################################################################
id_column = 'internal_id'
n_clusters = 450
random_state = 0

# The id column becomes the index inside the clustering loop, so it can never be listed as a feature
numerical_features = [
    c for c in clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns
    if c != id_column
]
categorical_columns = [
    c for c in clustering_features_combined.select_dtypes(include=['object', 'category']).columns
    if c != id_column
]
# Every non-id categorical column is cast to str, grouping columns included, exactly as before - so the
# groupby below sees the same keys it always did
for col in categorical_columns:
    clustering_features_combined[col] = clustering_features_combined[col].astype(str)
# The grouping columns are constant within a group, so they are left out of the encoded features.
# (The previous filter compared each name against a list nested inside a list and so excluded nothing.)
categorical_features = [c for c in categorical_columns if c not in grouping_columns]

training_data_grouped = clustering_features_combined.groupby(grouping_columns)
group_sizes = {name: len(group) for name, group in training_data_grouped}
total_size = sum(group_sizes.values())
# Share the cluster budget out in proportion to group size, with at least one cluster per group
cluster_allocation = {
    name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
}
# Adjust cluster allocation to ensure total clusters sum to n_clusters
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
final_clusters = []
# Cluster each group independently and rank every property by its distance to its cluster centroid
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
    # Carry the id as the index so it is never seen as a feature. set_index returns a new frame, so the frame
    # handed out by the groupby is left untouched.
    group_data = group_data.set_index(id_column)
    # KMeans raises when asked for more clusters than there are samples, so cap the allocation at group size
    group_n_clusters = min(cluster_allocation[group_variables], len(group_data))

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
    # Fit the pipeline to the data
    pipeline.fit(group_data)

    # Re-create the feature matrix KMeans was fitted on; the centroids live in this same transformed space
    processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
    # The transformer may hand back a sparse matrix; densify it for the arithmetic below
    if not isinstance(processed_data, np.ndarray):
        processed_data = processed_data.toarray()

    kmeans = pipeline.named_steps['kmeans']
    group_data['cluster'] = kmeans.labels_
    # Euclidean distance from each point to the centroid of its own cluster, computed for all rows at once
    assigned_centroids = kmeans.cluster_centers_[kmeans.labels_]
    group_data['distance_to_centroid'] = np.linalg.norm(processed_data - assigned_centroids, axis=1)

    # Rank 1 is the property closest to the centroid; ties are broken by row order
    group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
    group_data = group_data.sort_values(by=['cluster', 'rank']).reset_index()

    to_append = group_data[[id_column, "cluster", "rank"]].copy()
    # Cluster labels restart at 0 in every group, so suffix the group key to make them unique overall
    to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
    final_clusters.append(to_append)

final_clusters = pd.concat(final_clusters)
# Remap the clusters from the composite names to consecutive integers starting at 0, stored as strings
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping).astype(str)
# Attach the cluster label and within-cluster rank to every clustered property; rank 1 marks the
# representative of each archetype
assigned_clusters = clustering_features_combined.merge(final_clusters, on="internal_id", how="left")
assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1

# Bring the archetype assignment and the raw master sheet descriptors onto the full asset list
archetype_columns = assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]]
master_sheet_columns = master_sheet_clustering_features[["Address ID", "Property Type", "Walls", "Roofs", "Heating"]]
asset_list_with_archetypes = asset_list.merge(archetype_columns, how="left", on="internal_id")
asset_list_with_archetypes = asset_list_with_archetypes.merge(master_sheet_columns, how="left", on="Address ID")

# We populate the reasons for no archetype. They are applied in order, so when several apply to the same
# property the last one listed is the one that ends up in the cluster column.
no_archetype_reasons = [
    # 1) If it's not a priority postcode
    (~asset_list_with_archetypes["is_priority_postcode"], "NOT PRIORITY POSTCODE"),
    # 2) If it's EPC C or above
    (asset_list_with_archetypes["is_epc_c_or_above"], "EPC C OR ABOVE"),
    # If it's in Wave 2.1
    (asset_list_with_archetypes["In Osmosis Wave 2.1"], "IN WAVE 2.1"),
    # Has missing uprn
    (pd.isnull(asset_list_with_archetypes["uprn"]), "MISSING UPRN"),
]
for applies, reason in no_archetype_reasons:
    asset_list_with_archetypes["cluster"] = np.where(applies, reason, asset_list_with_archetypes["cluster"])

# Ranks become text; properties without a rank are labelled via the temporary -999 placeholder
rank_labels = asset_list_with_archetypes["rank"].fillna(-999).astype(int).astype(str)
asset_list_with_archetypes["rank"] = rank_labels.replace("-999", "NO ARCHETYPE")
asset_list_with_archetypes["archetype_representative"] = (
    asset_list_with_archetypes["archetype_representative"].fillna(False)
)
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.1.csv", index=False)
# Produce the archetyping features: one row per clustered property with its cluster, rank, location and
# EPC fields, the asset identifiers, and the master sheet columns used to build the features
archetype_summary = assigned_clusters[
    [
        "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
        "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
    ]
]
asset_identifiers = asset_list[["internal_id", "uprn", "external_address_id"]]

archetyping_features_csv = archetype_summary.merge(asset_identifiers, how="left", on="internal_id")
archetyping_features_csv = archetyping_features_csv.merge(
    master_sheet_clustering_features,
    how="left",
    left_on="external_address_id",
    right_on="Address ID"
)
# The master sheet's "Address ID" duplicates external_address_id after the merge, so drop it and let
# external_address_id take over the customer-facing name
archetyping_features_csv = archetyping_features_csv.drop(columns=["Address ID"])
archetyping_features_csv = archetyping_features_csv.rename(
    columns={
        "internal_id": "Osm. ID",
        "external_address_id": "Address ID",
    }
)
# NOTE(review): cluster labels are cast to str where final_clusters is built, so this ordering is
# lexicographic ("10" sorts before "2") - confirm that is acceptable for the export
archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
archetyping_features_csv.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
)

# Quick check of how widely the archetype representatives are spread geographically
representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
print(representatives["postal_region"].nunique())
print(representatives["county"].nunique())
def read_asset_list(
    asset_list_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/"
        "Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx"
    ),
    udprn_path="/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx",
):
    """
    Load the asset list workbook, attach UDPRNs and normalise the column names.

    :param asset_list_path: asset list workbook; its column headers are on the fifth row (header=4)
    :param udprn_path: workbook with "AddressId" and "UDPRN" columns
    :return: DataFrame of the assets that have a row in the UDPRN workbook (inner merge on "Address ID"),
        with the renamed columns below plus a "full_address" column
    """
    asset_list = pd.read_excel(asset_list_path, header=4)

    udprn_data = pd.read_excel(udprn_path, header=0)[["AddressId", "UDPRN"]].rename(
        columns={"AddressId": "Address ID"}
    )
    # Via the nullable integer type first so whole numbers do not pick up a trailing ".0" when made text
    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)

    # NOTE(review): only the UDPRN side is cast to str here; this merge relies on the asset list's
    # "Address ID" already being text - confirm against the workbook
    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
    asset_list = asset_list.rename(
        columns={
            "UDPRN": "udprn",
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
            "Owning body": "owner"
        }
    )

    # Build "line 1[, line 2], Town, POSTCODE", leaving out line 2 where it is missing
    town = asset_list["city_town"].str.title()
    with_line_2 = asset_list["address1"] + ", " + asset_list["address2"] + ", " + town + ", " + asset_list["postcode"]
    without_line_2 = asset_list["address1"] + ", " + town + ", " + asset_list["postcode"]
    asset_list["full_address"] = np.where(asset_list["address2"].notna(), with_line_2, without_line_2)
    return asset_list
def merge_uprn_to_asset_list(asset_list):
    """
    Attach a UPRN and standardised address to the asset list by combining four lookup sources.

    Lookups 1 and 2 are JSON files read from S3 (tagged "Exact" and "EPC"), lookup 3 is an ideal-postcodes
    pull joined to the assets on udprn (tagged "Exact"), and lookup 4 is a manually reviewed CSV (tagged
    "Fuzzy").

    :param asset_list: asset DataFrame; this function reads its "udprn", "internal_id" and
        "external_address_id" columns
    :return: tuple of (asset_list inner-merged with the combined lookup on internal_id and
        external_address_id, the lookup 2 DataFrame)
    :raises AssertionError: if the combined row count of the four lookups differs from len(asset_list)
    """
    # Read in the lookups
    # NOTE(review): lookups 1 and 2 are read from an "scustomers/" prefix whereas lookup 3 uses "customers/" -
    # confirm the keys really differ in the bucket
    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
    )))
    uprn_lookup_1["match_type"] = "Exact"

    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
    )))
    uprn_lookup_2 = uprn_lookup_2.rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )
    uprn_lookup_2["match_type"] = "EPC"
    # Manual override of the UPRN recorded against internal_id 1091
    uprn_lookup_2["uprn"] = np.where(
        uprn_lookup_2["internal_id"] == 1091,
        83143766,
        uprn_lookup_2["uprn"]
    )

    uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
    )))
    uprn_lookup_3["standardised_address"] = uprn_lookup_3[
        ["line_1", "line_2", "line_3", "district", "postcode"]
    ].apply(concatenate_row, axis=1)
    uprn_lookup_3 = uprn_lookup_3[
        ["udprn", "uprn", "standardised_address", "postcode"]
    ].rename(columns={"postcode": "standardised_postcode"})
    uprn_lookup_3["match_type"] = "Exact"

    uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
    # NOTE(review): option 1 is cast straight to str while option 2 goes through Int64 first - if option 1
    # is ever read as float its UPRNs will carry a trailing ".0"; confirm the column's dtype
    uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
    uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)

    # prepare lookup 4: each row records which candidate was chosen (option 1, option 2, otherwise the
    # manually entered values)
    uprn_lookup_4 = []
    for _, x in uprn_lookup_4_basis.iterrows():
        if x["option"] == 1:
            uprn = x["os_option_1_uprn"]
            standardised_address = x["os_option_1_address"]
            postcode = x["os_option_1_postcode"]
        elif x["option"] == 2:
            uprn = x["os_option_2_uprn"]
            standardised_address = x["os_option_2_address"]
            # For option 2 the postcode is taken from the last comma separated part of the address
            postcode = x["os_option_2_address"].split(", ")[-1]
        else:
            uprn = x["manual_uprn"]
            standardised_address = x["manual_address"]
            postcode = x["manual_postcode"]
        uprn_lookup_4.append(
            {
                "internal_id": x["internal_id"],
                "external_address_id": x["external_address_id"],
                "uprn": uprn,
                "standardised_address": standardised_address,
                "standardised_postcode": postcode,
                # Not available from the manual review; kept as empty columns
                "property_type": None,
                "built_form": None
            }
        )
    uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
    uprn_lookup_4["match_type"] = "Fuzzy"

    # concat
    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])

    # The lookups together are expected to account for each asset row. Raised explicitly (rather than with a
    # bare assert) so the check survives `python -O` and reports the counts involved.
    lookup_rows = len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4)
    if lookup_rows != len(asset_list):
        raise AssertionError(
            f"UPRN lookups contain {lookup_rows} rows but the asset list has {len(asset_list)}"
        )

    # Final preps of lookups: lookup 3 is keyed by udprn only, so borrow the ids from the asset list
    uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
    uprn_lookup_3 = uprn_lookup_3.merge(
        asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
    )
    uprn_lookup = pd.concat([
        uprn_lookup,
        uprn_lookup_3,
        uprn_lookup_4
    ])
    uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)

    # udprn is already on the asset list, so it is dropped from the lookup before merging
    asset_list = asset_list.merge(
        uprn_lookup.drop(columns=["udprn"]),
        how="inner",
        on=["internal_id", "external_address_id"]
    )
    return asset_list, uprn_lookup_2
def read_omosis_wave_2_1(
    wave_2_1_path="/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
):
    """
    Load the Osmosis Wave 2.1 workbook and extract the asset ids that are still in the programme.

    :param wave_2_1_path: Wave 2.1 workbook; its column headers are on the fifth row (header=4)
    :return: tuple of (list of int asset ids, the cleaned DataFrame without "Removed from program" rows)
    """
    osmosis_wave_2_1 = pd.read_excel(wave_2_1_path, header=4)
    # Remove double spaces from "Name"
    osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace("  ", " ", regex=False)
    osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
    osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]

    # We produce a cleaned list of asset ids from osmosis_wave_2_1
    osmosis_wave_2_1_asset_ids = []
    for raw_id in osmosis_wave_2_1["Asset ID"].values:
        if pd.isnull(raw_id):
            continue
        if isinstance(raw_id, str):
            # We have some ids that are in the form 'id1, id2' so we split them; blank pieces left by a
            # stray comma are skipped
            osmosis_wave_2_1_asset_ids.extend(
                int(part.strip()) for part in raw_id.split(",") if part.strip()
            )
        else:
            # Cells holding a single id may be read as numbers rather than text
            osmosis_wave_2_1_asset_ids.append(int(raw_id))
    return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
def read_stonewater_asset_data(
    master_sheet_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
        "sheet.csv"
    ),
    priority_postcodes_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
        "postcodes.csv"
    ),
):
    """
    Load the master sheet and the priority postcodes, and list the addresses already in a Wave 2.1 programme.

    :param master_sheet_path: latin1 encoded master sheet CSV
    :param priority_postcodes_path: priority postcodes CSV; its header row is given to pandas as header=17
    :return: tuple of (list of priority postcodes, list of Address IDs (str) flagged "Yes" for any of the four
        Wave 2.1 programmes, the master sheet with "Address ID" cast to str)
    """
    master_sheet = pd.read_csv(master_sheet_path, encoding='latin1')
    # Record which rows really have an Address ID before the cast to str, after which a missing value can no
    # longer be told apart by a null check
    has_address_id = master_sheet["Address ID"].notna()
    master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)

    in_previous_wave = (
        (master_sheet["In Osmosis W2.1"] == "Yes") |
        (master_sheet["In Wates Wave 2.1"] == "Yes") |
        (master_sheet["In Liv Green Wave 2.1"] == "Yes") |
        (master_sheet["In CCS Wave 2.1"] == "Yes")
    )
    previous_waves_address_id = master_sheet.loc[in_previous_wave & has_address_id, "Address ID"].tolist()

    # We also read the priority postcodes
    priority_postcodes = pd.read_csv(priority_postcodes_path, header=17)["Postcode"].tolist()
    return priority_postcodes, previous_waves_address_id, master_sheet
def read_epc_data(uprn_lookup_2):
    """
    Load both batches of EPC records from S3 and stack them into one DataFrame.

    :param uprn_lookup_2: DataFrame whose "internal_id" column lists the properties to keep from batch 1
    :return: DataFrame with the filtered first batch followed by all of the second batch
    """
    raw_epc_json = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/epc_data.json"
    )
    epc_data = pd.DataFrame(json.loads(raw_epc_json))

    # Manual override of the UPRN recorded against internal_id 1091
    is_overridden = epc_data["internal_id"] == 1091
    epc_data["uprn"] = np.where(is_overridden, 83143766, epc_data["uprn"])

    # We drop some EPCs: only properties present in lookup 2 are kept
    wanted_ids = uprn_lookup_2["internal_id"].values
    epc_data = epc_data[epc_data["internal_id"].isin(wanted_ids)]

    # NOTE(review): the second batch is stored as a pickle - only ever load it from a trusted bucket
    batch_2_records = read_pickle_from_s3(
        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
        bucket_name="retrofit-data-dev"
    )
    epc_data_batch_2 = pd.DataFrame(batch_2_records)

    return pd.concat([epc_data, epc_data_batch_2])