mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
2671 lines
101 KiB
Python
2671 lines
101 KiB
Python
import json
|
|
from tqdm import tqdm
|
|
import os
|
|
from dotenv import load_dotenv
|
|
from backend.SearchEpc import SearchEpc
|
|
import urllib.parse
|
|
import requests
|
|
from datetime import datetime
|
|
from scipy import stats
|
|
|
|
from fuzzywuzzy import fuzz
|
|
import numpy as np
|
|
import pandas as pd
|
|
import time
|
|
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
|
|
save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.pipeline import Pipeline
|
|
from scipy.spatial.distance import cdist
|
|
from sklearn.metrics import pairwise_distances_argmin_min
|
|
|
|
# Load the backend environment file so the EPC API credentials are available via the process environment.
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Manual map from internal_id to UPRN, for properties where we do have an EPC
# but the EPC record didn't give us the UPRN.
missing_uprn_map = pd.DataFrame(
    [
        # 1 Church Street, Alfreton, DE55 7AH
        # Doesn't seem to exist any more
        {"internal_id": 78, "mapped_uprn": None},
        # 1 Granville Road, Luton, LU1 1PA
        {"internal_id": 315, "mapped_uprn": 100080148856},
        # 11 College Street, Birstall, Batley, WF17 9HF
        # The EPC record is for 11 and 11a
        {"internal_id": 1090, "mapped_uprn": 83190440},
        # 11a College Street, Birstall, Batley, WF17 9HF
        {"internal_id": 1092, "mapped_uprn": 83143766},
        # Flat 5 Friars Street, Hereford, HR4 0AS
        # TODO: Check this
        # This UPRN is for 5 Friars Court, which is a flat
        {"internal_id": 1384, "mapped_uprn": 200002600892},
        # Flat 7 Friars Street, Hereford, HR4 0AS
        # TODO: Check this
        # This UPRN is for 7 Friars Court, which is a flat
        {"internal_id": 1385, "mapped_uprn": 200002600894},
        # 1 Waverley Street, Dudley, DY2 0YE
        {"internal_id": 3349, "mapped_uprn": 90022438},
        # 5 Brighton Road, Burgh Heath, Tadworth, KT20 6BQ
        # TODO: Check this
        # This UPRN is for 5 Copthorne, Brighton Road, Burgh Heath, KT20 6BQ, which is a flat
        {"internal_id": 5027, "mapped_uprn": 100062145273},
        # Room 1, 21 Coxford Road, Southampton, SO16 5FG
        # This is for 21 Coxford Road
        {"internal_id": 5554, "mapped_uprn": 100060692392},
    ]
)

# internal_ids whose EPC records are filtered out of the EPC data in compile_data
internal_id_epcs_to_drop = [315, 1384, 1385, 3349]
|
|
|
|
|
|
def remove_commas_and_full_stops(input_string: str) -> str:
    """
    Strip every comma and full stop out of a string.

    Args:
        input_string (str): The text to clean.

    Returns:
        str: A copy of ``input_string`` with all ',' and '.' characters deleted.
            Every other character (including whitespace) is left as it was.
    """
    # A translation table with only a "delete" set removes both characters in one pass.
    punctuation_to_drop = str.maketrans("", "", ",.")
    return input_string.translate(punctuation_to_drop)
|
|
|
|
|
|
def get_places_with_retry(searcher, max_retries=5, wait_time=2):
    """
    Call ``searcher.ordnance_survey_client.get_places_api()`` until it reports success.

    A call counts as successful when the returned object's ``"status"`` entry equals 200.
    Any other status, or any exception raised during the call, is printed and treated
    as a failed attempt. The function sleeps between attempts, but not after the last one.

    Args:
        searcher (object): Object exposing ``ordnance_survey_client.get_places_api()``.
        max_retries (int): Maximum number of attempts to make.
        wait_time (int): Seconds to sleep between consecutive attempts.

    Returns:
        result: The first response whose status is 200, or None if every attempt failed.
    """
    for attempt_number in range(1, max_retries + 1):
        try:
            response = searcher.ordnance_survey_client.get_places_api()
            status = response.get("status")
            if status == 200:
                # Success - hand the response straight back to the caller
                return response
            print(f"Attempt {attempt_number} failed with status code: {status}")
        except Exception as error:
            # Best-effort retry: report the problem and fall through to the next attempt
            print(f"Attempt {attempt_number} failed with error: {error}")

        # Only wait when there is another attempt still to come
        if attempt_number < max_retries:
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    print(f"All {max_retries} attempts failed.")
    return None
|
|
|
|
|
|
def app() -> None:
    """
    Match the Stonewater asset list against EPC and Ordnance Survey (OS) address data.

    This script handles the preparation of the data from Stonewater, to archetype a collection
    of 5.3k properties and reduce that down to a representative set of 450 properties.

    Here, we prepare the input data for clustering. The active (non-commented) code:

    1. Reads the asset list spreadsheet from S3, keeps/renames the address columns and builds
       a ``full_address`` string per asset.
    2. Loads the previously extracted EPC data from S3 and scores each EPC address against the
       asset list address (house number equality plus a fuzzy string ratio).
    3. Loads the previously extracted OS results from S3 for the assets that did not pass step 2
       and scores those against the asset list in the same way.
    4. Re-queries the OS API (with retries) for the records that still look problematic.
    5. For records whose postcode still disagrees, looks for any OS result in the same postcode.
    6. Writes "Stonewater - addresses with no matches.xlsx" and builds a frame of best matches.

    The commented-out sections are the one-off API pulls and S3 uploads whose outputs are
    read back from S3 by the active code.

    :return: None
    """

    # TODO: Temp read from local machine - move to s3
    # asset_list = pd.read_excel(
    #     "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
    # )

    asset_list = read_excel_from_s3(
        file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        bucket_name="retrofit-data-dev",
        header_row=4
    )

    # Drop the bottom 4 rows, which are completely missing
    asset_list = asset_list.head(-4)

    # Keep just the columns we're interested in, renamed to our internal snake_case names
    asset_list = asset_list[
        [
            "Osm. ID",
            "Org. ref.",
            "Postcode",
            "House no",
            "Name",
            "Address line 2",
            "City/Town",
            "County",
            "Address ID",  # This is not uprn
        ]
    ].rename(
        columns={
            "Osm. ID": "internal_id",
            "Org. ref.": "customer_asset_id",
            "Postcode": "postcode",
            "House no": "house_number",
            "Name": "address1",
            "Address line 2": "address2",
            "City/Town": "city_town",
            "County": "county",
            "Address ID": "external_address_id",
        }
    )

    # Create full address: "address1, [address2, ]City/Town, postcode"
    # address2 is only included when it is present; county is deliberately left out
    asset_list["full_address"] = np.where(
        ~pd.isnull(asset_list["address2"]),
        (
            asset_list["address1"] + ", " +
            asset_list["address2"] + ", " +
            asset_list["city_town"].str.title() + ", " +
            # asset_list["county"] + ", " +
            asset_list["postcode"]
        ),
        asset_list["address1"] + ", " +
        asset_list["city_town"].str.title() + ", " +
        # asset_list["county"] + ", " +
        asset_list["postcode"]
    )

    # Any null component makes the concatenated address null, so fail fast if that happened
    if pd.isnull(asset_list["full_address"]).sum():
        raise ValueError("Missing full addresses")

    # Pull in the data
    # This data has already been pulled as much as it can be, so we retrieve the existing extraction from S3

    # Perform an initial pull without ordnance survey data
    # epc_data = []
    # older_epc_data = {}
    #
    # for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key="",
    #         full_address=str(asset["full_address"]),
    #         uprn=asset.get("uprn", None),
    #     )
    #     searcher.find_property(skip_os=True)
    #
    #     if searcher.newest_epc is None:
    #         continue
    #
    #     epc_data.append(
    #         {
    #             "internal_id": asset["internal_id"],
    #             **searcher.newest_epc
    #         }
    #     )
    #
    #     if searcher.older_epcs is not None:
    #         older_epc_data[asset["internal_id"]] = searcher.older_epcs
    #
    # # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(epc_data),
    #     s3_file_name="customers/Stonewater/clustering/epc_data.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(older_epc_data),
    #     s3_file_name="customers/Stonewater/clustering/old_epc_data.json",
    #     bucket_name="retrofit-data-dev"
    # )
    # We read this directly from s3
    epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )

    # NOTE(review): older_epc_data is loaded here but not referenced again by the active code below
    older_epc_data = json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
        )
    )

    # Perform a comparison between the EPC address and the asset list address, just to double check

    epc_data_df = pd.DataFrame(epc_data)
    # Inner merge: only assets that have an EPC record take part in this comparison
    address_comparison = (
        asset_list[["internal_id", "full_address", "postcode", "house_number", "address1"]].merge(
            epc_data_df[["internal_id", "address", "postcode", "address1"]].rename(
                columns={
                    "address": "epc_address",
                    "postcode": "epc_postcode",
                    "address1": "epc_address1"
                }
            ),
            how="inner",
            on="internal_id"
        )
    )

    # Produce a metric, showing the matching confidence between the two
    address_comparison["epc_extracted_house_number"] = address_comparison["epc_address1"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )

    # Case-insensitive equality between the asset list house number and the one extracted from the EPC
    address_comparison["house_numbers_match"] = (
        address_comparison["house_number"].str.lower() == address_comparison["epc_extracted_house_number"].str.lower()
    )

    # We also produce an address similarity metric
    # We convert the strings to lower and remove common punctuation

    address_comparison["address_similarity_score"] = address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["address1"].lower()),
            remove_commas_and_full_stops(x["epc_address1"].lower())
        ),
        axis=1
    )

    address_comparison = address_comparison.sort_values("address_similarity_score", ascending=True)
    address_comparison = address_comparison[
        ["internal_id", "full_address", "epc_address", "address_similarity_score", "house_numbers_match"]
    ]

    # Anything with a similarity score of 90 or lower, or with mismatched house numbers, let's do again
    needs_ordnance_survey = address_comparison[
        (address_comparison["address_similarity_score"] <= 90) |
        (~address_comparison["house_numbers_match"])
    ].copy()

    # Everything that was compared and did not get flagged above is treated as a good EPC match
    is_ok = address_comparison[~address_comparison["internal_id"].isin(needs_ordnance_survey["internal_id"])]
    is_ok = is_ok.sort_values("address_similarity_score", ascending=True)

    # Assets needing OS data: every asset not in is_ok (so this also includes assets with no EPC at all)
    os_data_pull_asset_list = asset_list[
        ~asset_list["internal_id"].isin(is_ok["internal_id"].values)
    ].copy()

    # We have already done a partial pull of the Ordnance survey data so we can skip some of the records
    # os_most_relevant_1 = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/os_most_relevant_1.json"
    #     )
    # )
    #
    # os_most_relevant_2 = json.loads(
    #     read_from_s3(
    #         bucket_name="retrofit-data-dev",
    #         s3_file_name="customers/Stonewater/clustering/os_most_relevant_2.json"
    #     )
    # )
    #
    # fetched_internal_ids = (
    #     [x["internal_id"] for x in os_most_relevant_1] + [x["internal_id"] for x in os_most_relevant_2]
    # )
    #
    # # We remove any ids we've already fetched
    # os_data_pull_asset_list = os_data_pull_asset_list[
    #     ~os_data_pull_asset_list["internal_id"].isin(fetched_internal_ids)
    # ]
    #
    # # Our OK EPC data (is_ok) + ordnance survey fetched data + the data we need to fetch should equal the total
    # # number of assets
    # assert len(is_ok) + len(fetched_internal_ids) + len(os_data_pull_asset_list) == len(asset_list)

    os_data_pull_asset_list = os_data_pull_asset_list.reset_index(drop=True)

    # For each of these records, we pull the OS data
    # ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    # os_most_relevant = []
    # os_all = {}
    # errors = []
    # for _, asset in tqdm(os_data_pull_asset_list.iterrows(), total=len(os_data_pull_asset_list)):
    #     # Calls are throttled to 50 per minute in development mode, so lets just slow this down
    #     time.sleep(2)
    #
    #     searcher = SearchEpc(
    #         address1=str(asset["address1"]),
    #         postcode=str(asset["postcode"]),
    #         auth_token=EPC_AUTH_TOKEN,
    #         os_api_key=ORDNANCE_SURVEY_API_KEY,
    #         full_address=str(asset["full_address"]),
    #         uprn=asset.get("uprn", None),
    #     )
    #     searcher.ordnance_survey_client.full_address = asset["full_address"]
    #     # Attempt to get places data with retry logic
    #     result = get_places_with_retry(searcher)
    #
    #     if result:
    #         # Get the most relevant response
    #         os_most_relevant.append(
    #             {
    #                 "internal_id": asset["internal_id"],
    #                 **searcher.ordnance_survey_client.most_relevant_result
    #             }
    #         )
    #
    #         # Also keep the best 100 results
    #         os_all[asset["internal_id"]] = searcher.ordnance_survey_client.results
    #     else:
    #         # Record the internal_id of the asset that failed
    #         print("Error for address: " + asset["full_address"])
    #         errors.append(asset["internal_id"])

    # Store to S3
    # save_data_to_s3(
    #     data=json.dumps(os_most_relevant),
    #     s3_file_name="customers/Stonewater/clustering/os_most_relevant_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(os_all),
    #     s3_file_name="customers/Stonewater/clustering/os_all_3.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(errors),
    #     s3_file_name="customers/Stonewater/clustering/errors_3.json",
    #     bucket_name="retrofit-data-dev"
    # )

    # We now collate all of the data for the following steps:
    # 1) Checking the retrieved ordnance survey data against the asset list addresses
    # 2) A second round of querying the EPC api to find the EPC data, in case we retrieve something using uprn
    # 3) Predicting the EPC data for the properties we have no data for
    # 4) Retrieving additional data against the internal_id
    # 5) Creation of final dataset for clustering

    # The OS pull was stored in three numbered segments; stitch them back together
    os_most_relevant = []
    os_all = {}
    for i in ["1", "2", "3"]:
        most_relevant_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
        )
        os_most_relevant.extend(json.loads(most_relevant_segment))
        os_all_segment = read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
        )
        # On duplicate keys the later segment's entry wins
        os_all = {**os_all, **json.loads(os_all_segment)}

    os_most_relevant = pd.DataFrame(os_most_relevant)

    # Inner merge: only assets for which an OS "most relevant" record exists are compared
    os_address_comparison = os_data_pull_asset_list[
        ["internal_id", "full_address", "postcode", "house_number", "address1"]
    ].merge(
        os_most_relevant[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )

    # Check for records where the postcode doesn't match (case-insensitive)
    os_address_comparison["postcodes_match"] = (
        os_address_comparison["postcode"].str.lower() == os_address_comparison["POSTCODE"].str.lower()
    )

    # Extract the house number from the OS ADDRESS string
    os_address_comparison["extracted_house_number"] = os_address_comparison["ADDRESS"].apply(
        lambda x: SearchEpc.get_house_number(x)
    )

    # Compare house number (case-insensitive)
    os_address_comparison["house_numbers_match"] = (
        os_address_comparison["house_number"].str.lower() == os_address_comparison["extracted_house_number"].str.lower()
    )

    # String similarity between the asset list full address and the OS address
    os_address_comparison["address_similarity_score"] = os_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )

    os_address_comparison = os_address_comparison.sort_values("address_similarity_score", ascending=True)

    problematic = os_address_comparison.copy()

    # A record is problematic if ANY check fails: similarity of 80 or lower, house number mismatch
    # or postcode mismatch
    problematic = problematic[
        (problematic["address_similarity_score"] <= 80) |
        (~problematic["house_numbers_match"]) |
        (~problematic["postcodes_match"])
    ]

    # TODO: We'll label these problematic records as problematic, in the final output

    # different_postcodes = problematic[~problematic["postcodes_match"]].copy().reset_index(drop=True)

    # NOTE(review): the key is blank in the committed code - presumably it has to be filled in locally
    # before the OS requests below can succeed; confirm
    ORDNANCE_SURVEY_API_KEY = ""  # This API key is a temp key which I have copied locally
    problematic_os = []
    problematic_os_all = {}
    problematic_errors = []
    for _, row in tqdm(problematic.iterrows(), total=len(problematic)):
        # Let's just do a backup pull - we're now using LPI too
        # Pause between requests (the earlier, commented-out pull notes the API is throttled to
        # 50 calls per minute in development mode)
        time.sleep(2)
        backup_searher = SearchEpc(
            address1=row["address1"],
            postcode=row["postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=ORDNANCE_SURVEY_API_KEY,
            uprn=None,
        )
        # Attempt to get places data with retry logic
        result = get_places_with_retry(backup_searher)

        if result:
            # Get the most relevant response
            problematic_os.append(
                {
                    "internal_id": row["internal_id"],
                    **backup_searher.ordnance_survey_client.most_relevant_result
                }
            )

            # Also keep the best 100 results
            problematic_os_all[row["internal_id"]] = backup_searher.ordnance_survey_client.results
        else:
            # Record the internal_id of the asset that failed
            print("Error for address: " + row["full_address"])
            problematic_errors.append(row["internal_id"])

    # Store to S3
    # NOTE(review): these uploads are commented out, so the results collected above are only
    # held in memory for the rest of this function
    # save_data_to_s3(
    #     data=json.dumps(problematic_os),
    #     s3_file_name="customers/Stonewater/clustering/problematic_os.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(problematic_os_all),
    #     s3_file_name="customers/Stonewater/clustering/problematic_os_all.json",
    #     bucket_name="retrofit-data-dev"
    # )
    #
    # save_data_to_s3(
    #     data=json.dumps(problematic_errors),
    #     s3_file_name="customers/Stonewater/clustering/problematic_errors.json",
    #     bucket_name="retrofit-data-dev"
    # )

    # Next steps: We should collate all of the data and produce 1 big dataset

    # NOTE(review): if every lookup above failed, problematic_os is empty and the column
    # selection on problematic_os_df below will raise a KeyError
    problematic_os_df = pd.DataFrame(problematic_os)
    problematic_address_comparison = problematic[["internal_id", "full_address", "postcode", "house_number"]].merge(
        problematic_os_df[["internal_id", "ADDRESS", "POSTCODE", "UPRN"]],
        how="inner",
        on="internal_id"
    )

    # Here the OS postcode is taken as the last comma-separated component of the OS ADDRESS string
    problematic_address_comparison["OS_POSTCODE"] = problematic_address_comparison["ADDRESS"].str.split(", ").str[-1]
    problematic_address_comparison["postcodes_match"] = (
        problematic_address_comparison["postcode"].str.lower() == problematic_address_comparison[
            "OS_POSTCODE"].str.lower()
    )

    problematic_address_comparison["match_similarity_score"] = problematic_address_comparison.apply(
        lambda x: fuzz.ratio(
            remove_commas_and_full_stops(x["full_address"].lower()),
            remove_commas_and_full_stops(x["ADDRESS"].lower())
        ),
        axis=1
    )
    problematic_address_comparison = problematic_address_comparison.sort_values(
        "match_similarity_score", ascending=True
    )

    # let's do a house number extraction
    problematic_address_comparison["extracted_house_number"] = problematic_address_comparison.apply(
        lambda x: SearchEpc.get_house_number(x["ADDRESS"], x["OS_POSTCODE"]), axis=1
    )

    # Compare only the first token of the asset list house number (text before the first comma,
    # then before the first space) with the extracted OS house number
    # NOTE(review): house_numbers_different is not referenced again in the code visible here
    problematic_address_comparison["house_numbers_different"] = (
        problematic_address_comparison["house_number"].str.lower().str.split(",").str[0].str.split(" ").str[0] !=
        problematic_address_comparison[
            "extracted_house_number"].str.lower()
    )

    # We perform a final check
    # Take anything where the postcodes don't match, where the house numbers are different and the match similarity
    # is less than 90, or the match similarity is less than 80
    # NOTE(review): the filter below only applies the postcode-mismatch condition; the house number
    # and similarity conditions described above are not part of it
    final_check = problematic_address_comparison[
        (~problematic_address_comparison["postcodes_match"])
    ]
    final_check = final_check.sort_values("match_similarity_score", ascending=False)
    final_check = final_check.reset_index(drop=True)

    final_best_matches = []
    no_matches = []
    for _, row in final_check.iterrows():
        # Flatten the stored OS results for this asset: use the DPA record when present, else the LPI record
        os_data = problematic_os_all[row["internal_id"]]
        os_data = pd.DataFrame(
            [x["DPA"] if "DPA" in x else x["LPI"] for x in os_data]
        )

        # Build a single postcode column, preferring POSTCODE and falling back to POSTCODE_LOCATOR
        if ("POSTCODE_LOCATOR" in os_data.columns) and ("POSTCODE" in os_data.columns):
            os_data["postcode"] = np.where(
                ~pd.isnull(os_data["POSTCODE"]),
                os_data["POSTCODE"],
                os_data["POSTCODE_LOCATOR"]
            )
        elif "POSTCODE" in os_data.columns:
            os_data["postcode"] = os_data["POSTCODE"]
        else:
            os_data["postcode"] = os_data["POSTCODE_LOCATOR"]
        # Keep only OS results in the asset's own postcode; the first remaining row is taken as the best match
        os_data = os_data[os_data["postcode"].str.lower() == row["postcode"].lower()]
        if os_data.shape[0] >= 1:
            final_best_matches.append(
                {
                    "internal_id": row["internal_id"],
                    **os_data.iloc[0].to_dict()
                }
            )
        else:
            no_matches.append(
                {
                    "internal_id": row["internal_id"],
                    "full_address": row["full_address"],
                    "postcode": row["postcode"]
                }
            )

    no_matches = pd.DataFrame(no_matches)

    # Data to be confirmed
    # NOTE(review): this import rebinds the name no_matches, so the DataFrame built just above is
    # discarded and the export below uses the contents of the imported module instead
    from etl.customers.stonewater.no_matches import no_matches
    no_matches_to_export = pd.DataFrame(no_matches)
    # Attach the notes to the asset details and restore the customer's original column names for three columns
    no_matches_to_export = asset_list.merge(
        no_matches_to_export[["internal_id", "Note"]],
        how="inner",
        on="internal_id"
    ).rename(
        columns={
            "internal_id": "Osm. ID",
            "customer_asset_id": "Org. ref.",
            "external_address_id": "Address ID",
        }
    )
    # Relative path: the spreadsheet is written to the current working directory
    no_matches_to_export.to_excel("Stonewater - addresses with no matches.xlsx", index=False)

    # We also confirm final_best_matches
    final_best_matches_df = pd.DataFrame(final_best_matches)[
        ["internal_id", "ADDRESS", "UPRN"]
    ].rename(
        columns={
            "ADDRESS": "Ordnance Survey Address - same postcode (best match)",
            "UPRN": "UPRN - same postcode (best match)"
        }
    )
    # We also get their original match (the OS "most relevant" record from the first pull)
    final_best_matches_df = final_best_matches_df.merge(
        problematic[["internal_id", "ADDRESS", "UPRN"]].rename(
            columns={
                "ADDRESS": "Ordnance Survey Address - best possible match",
                "UPRN": "UPRN - best possible match"
            }
        ),
        how="inner",
        on="internal_id"
    )

    # merge on the original data
    # NOTE(review): final_best_matches_df is built but not returned or saved in the code visible here
    final_best_matches_df = asset_list.merge(
        final_best_matches_df,
        how="inner",
        on="internal_id"
    ).rename(
        columns={
            "internal_id": "Osm. ID",
            "customer_asset_id": "Org. ref.",
            "external_address_id": "Address ID",
        }
    )
|
|
|
|
# "Osm. ID": "internal_id",
|
|
# "Org. ref.": "customer_asset_id",
|
|
# "Postcode": "postcode",
|
|
# "House no": "house_number",
|
|
# "Name": "address1",
|
|
# "Address line 2": "address2",
|
|
# "City/Town": "city_town",
|
|
# "County": "county",
|
|
# "Address ID": "external_address_id",
|
|
|
|
|
|
def _flatten_os_results(os_results):
    """Build a DataFrame with one row per OS result, using its DPA record when present, else its LPI record."""
    return pd.DataFrame(
        [result["DPA"] if "DPA" in result else result["LPI"] for result in os_results]
    )


def _rows_matching_udprn(os_frame, udprn):
    """Return the rows of ``os_frame`` whose UDPRN, compared as text, equals ``udprn`` (empty if no UDPRN column)."""
    if "UDPRN" not in os_frame.columns:
        # e.g. an empty result list, or results that only carried LPI records without a UDPRN field
        return os_frame.iloc[0:0]
    return os_frame[os_frame["UDPRN"].astype(str) == str(udprn)]


def filter_os_data(p_os_data, p_os_data_all, udprn, is_flat=False):
    """
    Pick the Ordnance Survey record(s) for one property out of the stored OS results.

    Args:
        p_os_data (pd.DataFrame): The "most relevant" OS record(s) for the property.
        p_os_data_all (list[dict]): Every stored OS result for the property; each item is a dict
            holding a "DPA" or an "LPI" record.
        udprn (str | None): The UDPRN to look for. When None, no UDPRN matching is done and the
            first stored result is used instead.
        is_flat (bool): Only used when ``udprn`` is None. When True, the stored results are first
            restricted to CLASSIFICATION_CODE "RD06" (the code this function treats as a flat).
            Defaults to False so that callers which pass only three arguments keep working.

    Returns:
        pd.DataFrame: When ``udprn`` is None, at most one row (the first candidate). Otherwise the
        rows whose UDPRN equals ``udprn``, taken from ``p_os_data`` if it has any, else from
        ``p_os_data_all``. The frame is empty when nothing matches.
    """
    if udprn is None:
        candidates = _flatten_os_results(p_os_data_all)
        if is_flat:
            if "CLASSIFICATION_CODE" not in candidates.columns:
                # Nothing to classify (for instance no stored results at all) -> no flat can be selected
                return candidates.iloc[0:0]
            candidates = candidates[candidates["CLASSIFICATION_CODE"] == "RD06"]
        return candidates.head(1)

    # Compare UDPRNs as text in both lookups, so a numeric column still matches the string udprn
    final_os_data = _rows_matching_udprn(p_os_data, udprn)
    if final_os_data.empty:
        # The most relevant record was not the right one; search every stored result instead
        final_os_data = _rows_matching_udprn(_flatten_os_results(p_os_data_all), udprn)

    return final_os_data
|
|
|
|
|
|
def compile_data():
|
|
"""
|
|
Various data sources have been produced to create the final data source for Stonewater.
|
|
This function combines them
|
|
:return:
|
|
"""
|
|
########################################################################
|
|
# Read in data
|
|
########################################################################
|
|
# asset_list = read_excel_from_s3(
|
|
# file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
|
# bucket_name="retrofit-data-dev",
|
|
# header_row=4
|
|
# )
|
|
#
|
|
# udprn_data = read_excel_from_s3(
|
|
# file_key="customers/Stonewater/UDPRN updated RA Sample for 5 year programme.xlsx",
|
|
# bucket_name="retrofit-data-dev",
|
|
# header_row=0
|
|
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
|
|
|
|
asset_list = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
|
header=4
|
|
)
|
|
|
|
udprn_data = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
|
|
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
|
|
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
|
|
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
|
|
|
|
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
|
|
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
|
|
|
|
# Read in the lookups
|
|
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
|
|
)))
|
|
|
|
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
|
|
)))
|
|
uprn_lookup_2 = uprn_lookup_2.rename(
|
|
columns={
|
|
"epc_address": "standardised_address",
|
|
"epc_postcode": "standardised_postcode"
|
|
}
|
|
)
|
|
|
|
# concat
|
|
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
|
|
|
|
# TODO: Read in UPRNs or UDPRN
|
|
# UPRN LOOKUPS TO READ IN: address_uprn_udprn_lookup, address_uprn_udprn_lookup_2
|
|
|
|
epc_data = json.loads(
|
|
read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/epc_data.json"
|
|
)
|
|
)
|
|
epc_data = pd.DataFrame(epc_data)
|
|
|
|
# We drop come EPCS
|
|
epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]
|
|
|
|
# This we can use to produce additional variables such as number of old surveys
|
|
# older_epc_data = json.loads(
|
|
# read_from_s3(
|
|
# bucket_name="retrofit-data-dev",
|
|
# s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
|
|
# )
|
|
# )
|
|
# older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
|
|
|
|
# This is the first ordnance survey data pull
|
|
os_most_relevant_1 = []
|
|
os_all_1 = {}
|
|
for i in tqdm(["1", "2", "3"]):
|
|
most_relevant_segment = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
|
)
|
|
os_most_relevant_1.extend(json.loads(most_relevant_segment))
|
|
os_all_segment = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
|
)
|
|
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
|
|
|
|
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
|
|
|
|
# This is the second ordnance survey data pull
|
|
os_most_relevant_2 = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
|
|
)
|
|
os_most_relevant_2 = json.loads(os_most_relevant_2)
|
|
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
|
|
|
|
os_all_2 = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
|
|
)
|
|
os_all_2 = json.loads(os_all_2)
|
|
|
|
########################################################################
|
|
# Prepare asset list
|
|
########################################################################
|
|
# TODO: Merge on UPRNs
|
|
|
|
# Keep just the columns we're interested in
|
|
asset_list = asset_list[
|
|
[
|
|
"Osm. ID",
|
|
"Org. ref.",
|
|
"Postcode",
|
|
"House no",
|
|
"Name",
|
|
"Address line 2",
|
|
"City/Town",
|
|
"County",
|
|
"Address ID", # This is not uprn
|
|
"udprn"
|
|
]
|
|
].rename(
|
|
columns={
|
|
"Osm. ID": "internal_id",
|
|
"Org. ref.": "customer_asset_id",
|
|
"Postcode": "postcode",
|
|
"House no": "house_number",
|
|
"Name": "address1",
|
|
"Address line 2": "address2",
|
|
"City/Town": "city_town",
|
|
"County": "county",
|
|
"Address ID": "external_address_id",
|
|
}
|
|
)
|
|
|
|
# Create full address
|
|
asset_list["full_address"] = np.where(
|
|
~pd.isnull(asset_list["address2"]),
|
|
(
|
|
asset_list["address1"] + ", " +
|
|
asset_list["address2"] + ", " +
|
|
asset_list["city_town"].str.title() + ", " +
|
|
# asset_list["county"] + ", " +
|
|
asset_list["postcode"]
|
|
),
|
|
asset_list["address1"] + ", " +
|
|
asset_list["city_town"].str.title() + ", " +
|
|
# asset_list["county"] + ", " +
|
|
asset_list["postcode"]
|
|
)
|
|
|
|
if pd.isnull(asset_list["full_address"]).sum():
|
|
raise ValueError("Missing full addresses")
|
|
|
|
# Merge on UDPRN
|
|
|
|
asset_list = asset_list.merge(
|
|
uprn_lookup.drop(columns=["udprn"]), how="left", on=["internal_id", "external_address_id"]
|
|
)
|
|
|
|
# This is everything without a uprn
|
|
|
|
# Quick check to see if we have os data for every property that doesn't have an EPC
|
|
without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
|
|
|
|
os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
|
|
os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()
|
|
|
|
missing_os_data = []
|
|
for _, x in without_epc.iterrows():
|
|
# We would prioritise the data pulled the second time around
|
|
|
|
internal_id = x["internal_id"]
|
|
if internal_id in os_most_relevant_2_internal_ids:
|
|
continue
|
|
|
|
if internal_id in os_most_relevant_1_internal_ids:
|
|
continue
|
|
|
|
missing_os_data.append(internal_id)
|
|
|
|
if len(missing_os_data):
|
|
raise Exception("We don't have SOME data for each internal_id")
|
|
|
|
# Let's create a lookup table of internal_id, external_address_id, udprn, uprn, standardised_address
|
|
address_uprn_udprn_lookup = []
|
|
for _, x in without_epc.iterrows():
|
|
if pd.isnull(x["UDPRN"]):
|
|
continue
|
|
udprn = str(int(x["UDPRN"]))
|
|
internal_id = x["internal_id"]
|
|
|
|
is_flat = "flat" in x["address1"].lower()
|
|
|
|
# Get the OS data
|
|
final_os_data = pd.DataFrame()
|
|
if internal_id in os_most_relevant_1_internal_ids:
|
|
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
|
p_os_data_all = os_all_1[str(internal_id)]
|
|
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
|
|
|
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
|
|
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
|
p_os_data_all = os_all_2[str(internal_id)]
|
|
|
|
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn, is_flat)
|
|
|
|
if final_os_data.empty:
|
|
continue
|
|
|
|
if final_os_data.shape[0] != 1:
|
|
if final_os_data["UPRN"].nunique() > 1:
|
|
raise Exception("Investigate me")
|
|
|
|
address_uprn_udprn_lookup.append(
|
|
{
|
|
"internal_id": internal_id,
|
|
"external_address_id": x["external_address_id"],
|
|
"udprn": udprn,
|
|
"uprn": final_os_data["UPRN"].values[0],
|
|
"standardised_address": final_os_data["ADDRESS"].values[0],
|
|
"standardised_postcode": final_os_data["POSTCODE"].values[0]
|
|
}
|
|
)
|
|
|
|
# Store this lookup
|
|
# save_data_to_s3(
|
|
# data=json.dumps(address_uprn_udprn_lookup),
|
|
# s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json",
|
|
# bucket_name="retrofit-data-dev"
|
|
# )
|
|
|
|
address_uprn_udprn_lookup = pd.DataFrame(address_uprn_udprn_lookup)
|
|
missed = asset_list[~asset_list["internal_id"].isin(address_uprn_udprn_lookup["internal_id"].values)]
|
|
|
|
address_comparison = (
|
|
asset_list[
|
|
["internal_id", "external_address_id", "UDPRN", "full_address", "postcode", "house_number", "address1"]
|
|
].merge(
|
|
epc_data[["internal_id", "address", "postcode", "address1", "uprn"]].rename(
|
|
columns={
|
|
"address": "epc_address",
|
|
"postcode": "epc_postcode",
|
|
"address1": "epc_address1"
|
|
}
|
|
),
|
|
how="inner",
|
|
on="internal_id"
|
|
)
|
|
)
|
|
|
|
address_comparison["address_similarity_score"] = address_comparison.apply(
|
|
lambda x: fuzz.ratio(
|
|
remove_commas_and_full_stops(x["address1"].lower() + x["postcode"].lower()),
|
|
remove_commas_and_full_stops(x["epc_address1"].lower() + x["epc_postcode"].lower())
|
|
),
|
|
axis=1
|
|
)
|
|
address_comparison = address_comparison.sort_values("address_similarity_score", ascending=False)
|
|
# Split by similarity score: >= 95 is treated as a confident match, anything lower as low confidence
|
|
confident = address_comparison[address_comparison["address_similarity_score"] >= 95]
|
|
low_confidence = address_comparison[address_comparison["address_similarity_score"] < 95].copy()
|
|
|
|
lookup_2 = confident[
|
|
[
|
|
'internal_id', 'external_address_id', 'UDPRN', 'uprn',
|
|
'epc_address', 'epc_postcode']
|
|
].rename(columns={"UDPRN": "udprn"})
|
|
|
|
# Store in S3
|
|
# save_data_to_s3(
|
|
# data=json.dumps(lookup_2.to_dict("records")),
|
|
# s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json",
|
|
# bucket_name="retrofit-data-dev"
|
|
# )
|
|
|
|
# Need to deal with the low confidence records
|
|
low_confidence_asset_list = asset_list[asset_list["internal_id"].isin(low_confidence["internal_id"])]
|
|
for _, x in low_confidence_asset_list.iterrows():
|
|
udprn = str(int(x["UDPRN"]))
|
|
internal_id = x["internal_id"]
|
|
# Get the OS data
|
|
final_os_data = pd.DataFrame()
|
|
if internal_id in os_most_relevant_1_internal_ids:
|
|
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id]
|
|
p_os_data_all = os_all_1[str(internal_id)]
|
|
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
|
|
|
|
if (internal_id in os_most_relevant_2_internal_ids) and final_os_data.empty:
|
|
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id]
|
|
p_os_data_all = os_all_2[str(internal_id)]
|
|
|
|
final_os_data = filter_os_data(p_os_data, p_os_data_all, udprn)
|
|
|
|
# For the EPC data, some of them are missing UPRN
|
|
epc_data = epc_data.merge(missing_uprn_map, how="left", on="internal_id")
|
|
epc_data["uprn"] = np.where(
|
|
epc_data["uprn"] == "",
|
|
epc_data["mapped_uprn"],
|
|
epc_data["uprn"]
|
|
)
|
|
epc_data = epc_data.drop(columns=["mapped_uprn"])
|
|
|
|
# Once we have UPRNs, we might want to pull in the EPC data again
|
|
# epc_data_with_uprn = []
|
|
# older_epc_data_with_uprn = {}
|
|
#
|
|
# for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
|
|
# searcher = SearchEpc(
|
|
# address1=str(asset["address1"]),
|
|
# postcode=str(asset["postcode"]),
|
|
# auth_token=EPC_AUTH_TOKEN,
|
|
# os_api_key="",
|
|
# full_address=str(asset["full_address"]),
|
|
# uprn=asset["uprn"]
|
|
# )
|
|
# searcher.find_property(skip_os=True)
|
|
#
|
|
# if searcher.newest_epc is None:
|
|
# continue
|
|
#
|
|
# epc_data_with_uprn.append(
|
|
# {
|
|
# "internal_id": asset["internal_id"],
|
|
# **searcher.newest_epc
|
|
# }
|
|
# )
|
|
#
|
|
# if searcher.older_epcs is not None:
|
|
# older_epc_data_with_uprn[asset["internal_id"]] = searcher.older_epcs
|
|
|
|
# We now get the remaining properties
|
|
# TODO: We might want to use epc_data_with_uprn
|
|
remaining_properties = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
|
|
|
|
# We estimate the data
|
|
final_epcs = []
|
|
for _, p in remaining_properties.iterrows():
|
|
internal_id = p["internal_id"]
|
|
uprn = p["UPRN"]
|
|
|
|
if internal_id in os_most_relevant_1_internal_ids:
|
|
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id].to_dict("records")[0]
|
|
p_os_full = os_all_1[str(internal_id)]
|
|
else:
|
|
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id].to_dict("records")[0]
|
|
p_os_full = os_all_2[str(internal_id)]
|
|
p_os_full = pd.DataFrame(
|
|
[x["DPA"] if "DPA" in x else x["LPI"] for x in p_os_full]
|
|
)
|
|
|
|
# TODO: Add this back in
|
|
# When we have this
|
|
if p["uprn"] != p_os_data["UPRN"]:
|
|
# Get it from the older data
|
|
filtered = p_os_full[p_os_full["UPRN"] == p["uprn"]]
|
|
p_os_data = filtered.to_dict("records")[0]
|
|
|
|
searcher = SearchEpc(
|
|
address1=str(p["address1"]),
|
|
postcode=str(p["postcode"]),
|
|
auth_token=EPC_AUTH_TOKEN,
|
|
os_api_key="",
|
|
uprn=uprn
|
|
)
|
|
searcher.ordnance_survey_client.parse_classification_code(p_os_data["CLASSIFICATION_CODE"])
|
|
|
|
searcher.find_property(skip_os=True)
|
|
|
|
final_epcs.append(
|
|
{
|
|
"internal_id": internal_id,
|
|
**searcher.newest_epc
|
|
}
|
|
)
|
|
|
|
final_epcs = pd.DataFrame(final_epcs)
|
|
|
|
complete_epcs = pd.concat(
|
|
[
|
|
epc_data,
|
|
final_epcs
|
|
]
|
|
)
|
|
|
|
# We now pull spatial data
|
|
# We get the spatial file list and loop through each EPC and determine which file it needs.
|
|
# We then just read in the files that we need and get the data, for each uprn from that file
|
|
|
|
uprn_filenames = read_dataframe_from_s3_parquet(
|
|
bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
|
|
)
|
|
|
|
uprn_lookup = {}
|
|
for uprn in complete_epcs["uprn"]:
|
|
if pd.isnull(uprn):
|
|
# TODO: Do something about this!
|
|
continue
|
|
filtered_df = uprn_filenames[
|
|
(uprn_filenames["lower"] <= int(uprn))
|
|
& (uprn_filenames["upper"] >= int(uprn))
|
|
]
|
|
if filtered_df["filenames"].values[0] in uprn_lookup:
|
|
uprn_lookup[filtered_df["filenames"].values[0]].append(int(uprn))
|
|
else:
|
|
uprn_lookup[filtered_df["filenames"].values[0]] = [int(uprn)]
|
|
|
|
spatial_data_to_uprn = []
|
|
for filename, associated_uprn in tqdm(uprn_lookup.items(), total=len(uprn_lookup)):
|
|
# Read in the file
|
|
spatial_data = read_dataframe_from_s3_parquet(
|
|
bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
|
|
)
|
|
|
|
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
|
|
spatial_data_to_uprn.append(spatial_df)
|
|
|
|
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
|
|
|
# TODO: Let's store this in s3
|
|
# save_data_to_s3(
|
|
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
|
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
|
# bucket_name="retrofit-data-dev"
|
|
# )
|
|
|
|
# We merge this spatial data onto final EPCS
|
|
|
|
|
|
def concatenate_row(row):
    """Join the non-empty values of a row into one comma-separated string.

    Null values (``None``/``NaN``) and empty strings are skipped; every
    remaining value is converted to ``str`` and joined with ``", "`` in the
    row's original order.

    :param row: a pandas Series, e.g. one row handed over by
        ``DataFrame.apply(..., axis=1)``
    :return: the joined string, or ``""`` when nothing is left to join
    """
    present = row.dropna()
    # Filter blanks with a boolean mask instead of ``replace('', None)``:
    # older pandas releases ignored an explicit ``None`` there and pad-filled
    # the blank with the previous value, which duplicated address parts.
    present = present[present != '']
    return ', '.join(present.astype(str))
|
|
|
|
|
|
def adjust_clusters(cluster_allocation, total_clusters, min_per_group=1):
    """Nudge per-group cluster counts so that they sum to ``total_clusters``.

    Clusters are added to (or removed from) groups one at a time, visiting
    the groups from the largest current allocation to the smallest. Passes
    are repeated until the total matches, so a gap larger than the number of
    groups is still closed. When removing clusters, a group is never taken
    below ``min_per_group``; if every group is already at that floor the
    allocation is returned with a total above ``total_clusters``.

    :param cluster_allocation: dict mapping group -> number of clusters.
        It is modified in place.
    :param total_clusters: the total the allocations should sum to
    :param min_per_group: smallest allocation a group may be reduced to
    :return: the same ``cluster_allocation`` dict, after adjustment
    """
    adjustment = total_clusters - sum(cluster_allocation.values())
    step = 1 if adjustment > 0 else -1
    remaining = abs(adjustment)

    while remaining > 0:
        # Largest groups first; the sort is stable, so ties keep the dict's
        # insertion order.
        candidates = sorted(cluster_allocation, key=cluster_allocation.get, reverse=True)
        if step < 0:
            # Only groups that are still above the floor can give up a cluster.
            candidates = [g for g in candidates if cluster_allocation[g] > min_per_group]
        if not candidates:
            # Empty allocation, or nothing left that can be reduced.
            break

        for group in candidates:
            cluster_allocation[group] += step
            remaining -= 1
            if remaining <= 0:
                break

    return cluster_allocation
|
|
|
|
|
|
def compile_data_final():
|
|
# Updated version:
|
|
|
|
"""
|
|
Various data sources have been produced to create the final data source for Stonewater.
|
|
This function combines them
|
|
:return:
|
|
"""
|
|
########################################################################
|
|
# Read in data
|
|
########################################################################
|
|
|
|
asset_list = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
|
header=4
|
|
)
|
|
|
|
udprn_data = pd.read_excel(
|
|
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
|
|
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
|
|
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
|
|
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
|
|
|
|
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
|
|
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
|
|
|
|
# Read in the lookups
|
|
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
|
|
)))
|
|
uprn_lookup_1["match_type"] = "Exact"
|
|
|
|
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
|
|
)))
|
|
uprn_lookup_2 = uprn_lookup_2.rename(
|
|
columns={
|
|
"epc_address": "standardised_address",
|
|
"epc_postcode": "standardised_postcode"
|
|
}
|
|
)
|
|
uprn_lookup_2["match_type"] = "EPC"
|
|
uprn_lookup_2["uprn"] = np.where(
|
|
uprn_lookup_2["internal_id"] == 1091,
|
|
83143766,
|
|
uprn_lookup_2["uprn"]
|
|
)
|
|
|
|
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
|
|
)))
|
|
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
|
|
concatenate_row, axis=1
|
|
)
|
|
uprn_lookup_3 = uprn_lookup_3[
|
|
["udprn", "uprn", "standardised_address", "postcode"]
|
|
].rename(columns={"postcode": "standardised_postcode"})
|
|
uprn_lookup_3["match_type"] = "Exact"
|
|
|
|
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
|
|
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
|
|
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
|
|
# prepare lookup 4
|
|
uprn_lookup_4 = []
|
|
for _, x in uprn_lookup_4_basis.iterrows():
|
|
|
|
property_type = None
|
|
built_form = None
|
|
if x["option"] == 1:
|
|
uprn = x["os_option_1_uprn"]
|
|
standardised_address = x["os_option_1_address"]
|
|
postcode = x["os_option_1_postcode"]
|
|
elif x["option"] == 2:
|
|
uprn = x["os_option_2_uprn"]
|
|
standardised_address = x["os_option_2_address"]
|
|
postcode = x["os_option_2_address"].split(", ")[-1]
|
|
else:
|
|
uprn = x["manual_uprn"]
|
|
standardised_address = x["manual_address"]
|
|
postcode = x["manual_postcode"]
|
|
|
|
uprn_lookup_4.append(
|
|
{
|
|
"internal_id": x["internal_id"],
|
|
"external_address_id": x["external_address_id"],
|
|
"uprn": uprn,
|
|
"standardised_address": standardised_address,
|
|
"standardised_postcode": postcode,
|
|
"property_type": property_type,
|
|
"built_form": built_form
|
|
}
|
|
)
|
|
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
|
|
uprn_lookup_4["match_type"] = "Fuzzy"
|
|
|
|
# concat
|
|
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
|
|
|
|
# We now merge all of the UPRNs onto the asset list
|
|
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
|
|
|
|
epc_data = json.loads(
|
|
read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/epc_data.json"
|
|
)
|
|
)
|
|
epc_data = pd.DataFrame(epc_data)
|
|
|
|
epc_data["uprn"] = np.where(
|
|
epc_data["internal_id"] == 1091,
|
|
83143766,
|
|
epc_data["uprn"]
|
|
)
|
|
|
|
# We drop some EPCs: keep only those whose internal_id is in the EPC-matched lookup (uprn_lookup_2)
|
|
epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]
|
|
|
|
# This we can use to produce additional variables such as number of old surveys
|
|
# older_epc_data = json.loads(
|
|
# read_from_s3(
|
|
# bucket_name="retrofit-data-dev",
|
|
# s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
|
|
# )
|
|
# )
|
|
# older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
|
|
|
|
########################################################################
|
|
# Prepare asset list
|
|
########################################################################
|
|
|
|
# Keep just the columns we're interested in
|
|
asset_list = asset_list[
|
|
[
|
|
"Osm. ID",
|
|
"Org. ref.",
|
|
"Postcode",
|
|
"House no",
|
|
"Name",
|
|
"Address line 2",
|
|
"City/Town",
|
|
"County",
|
|
"Address ID", # This is not uprn
|
|
"udprn",
|
|
"Owning body"
|
|
]
|
|
].rename(
|
|
columns={
|
|
"Osm. ID": "internal_id",
|
|
"Org. ref.": "customer_asset_id",
|
|
"Postcode": "postcode",
|
|
"House no": "house_number",
|
|
"Name": "address1",
|
|
"Address line 2": "address2",
|
|
"City/Town": "city_town",
|
|
"County": "county",
|
|
"Address ID": "external_address_id",
|
|
"Owning body": "owner"
|
|
}
|
|
)
|
|
|
|
# Create full address
|
|
asset_list["full_address"] = np.where(
|
|
~pd.isnull(asset_list["address2"]),
|
|
(
|
|
asset_list["address1"] + ", " +
|
|
asset_list["address2"] + ", " +
|
|
asset_list["city_town"].str.title() + ", " +
|
|
# asset_list["county"] + ", " +
|
|
asset_list["postcode"]
|
|
),
|
|
asset_list["address1"] + ", " +
|
|
asset_list["city_town"].str.title() + ", " +
|
|
# asset_list["county"] + ", " +
|
|
asset_list["postcode"]
|
|
)
|
|
|
|
if pd.isnull(asset_list["full_address"]).sum():
|
|
raise ValueError("Missing full addresses")
|
|
|
|
# Final preps of lookups
|
|
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
|
|
uprn_lookup_3 = uprn_lookup_3.merge(
|
|
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
|
|
)
|
|
uprn_lookup = pd.concat([
|
|
uprn_lookup,
|
|
uprn_lookup_3,
|
|
uprn_lookup_4
|
|
])
|
|
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
|
|
|
|
asset_list = asset_list.merge(
|
|
uprn_lookup.drop(columns=["udprn"]),
|
|
how="inner",
|
|
on=["internal_id", "external_address_id"]
|
|
)
|
|
|
|
# Store locally
|
|
# asset_list.to_excel("Stonewater asset list with uprn.xlsx")
|
|
|
|
# We take just domestic properties
|
|
|
|
# This is the first ordnance survey data pull
|
|
os_most_relevant_1 = []
|
|
os_all_1 = {}
|
|
for i in tqdm(["1", "2", "3"]):
|
|
most_relevant_segment = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
|
)
|
|
os_most_relevant_1.extend(json.loads(most_relevant_segment))
|
|
os_all_segment = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
|
)
|
|
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
|
|
|
|
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
|
|
|
|
# This is the second ordnance survey data pull
|
|
os_most_relevant_2 = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
|
|
)
|
|
os_most_relevant_2 = json.loads(os_most_relevant_2)
|
|
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
|
|
|
|
os_all_2 = read_from_s3(
|
|
bucket_name="retrofit-data-dev",
|
|
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
|
|
)
|
|
os_all_2 = json.loads(os_all_2)
|
|
|
|
needs_epc_data = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"])]
|
|
|
|
os_1_ids = os_most_relevant_1["internal_id"].values
|
|
os_2_ids = os_most_relevant_2["internal_id"].values
|
|
|
|
epc_data_batch_2 = []
|
|
older_epcs_batch_2 = {}
|
|
for _, property in tqdm(needs_epc_data.iterrows(), total=len(needs_epc_data)):
|
|
if pd.isnull(property["uprn"]):
|
|
continue
|
|
searcher = SearchEpc(
|
|
address1=", ".join(property["standardised_address"].split(", ")[:-1]),
|
|
postcode=property["standardised_postcode"],
|
|
auth_token=EPC_AUTH_TOKEN,
|
|
os_api_key="",
|
|
full_address=property["standardised_address"],
|
|
uprn=property["uprn"]
|
|
)
|
|
searcher.find_property(skip_os=True)
|
|
|
|
if searcher.newest_epc is None and property["match_type"] == "Exact":
|
|
# Estimate!
|
|
# Get the OS data
|
|
p_os_df = pd.DataFrame()
|
|
if property["internal_id"] in os_1_ids:
|
|
p_os_df = pd.DataFrame(
|
|
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_1[str(property["internal_id"])]]
|
|
)
|
|
p_os_df = p_os_df[p_os_df["UPRN"].astype(str) == property["uprn"]]
|
|
|
|
if p_os_df.empty:
|
|
p_os_df = pd.DataFrame(
|
|
[x["DPA"] if "DPA" in x else x["LPI"] for x in os_all_2[str(property["internal_id"])]]
|
|
)
|
|
p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
|
|
|
|
if not p_os_df.empty:
|
|
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
|
|
else:
|
|
searcher.ordnance_survey_client.property_type = ""
|
|
# Now we estimate
|
|
searcher.newest_epc = searcher.estimate_epc(
|
|
property_type=searcher.ordnance_survey_client.property_type,
|
|
built_form=searcher.ordnance_survey_client.built_form,
|
|
lmks_to_drop=None,
|
|
exclude_old=True
|
|
)
|
|
|
|
elif searcher.newest_epc is None and property["match_type"] == "Fuzzy":
|
|
|
|
if "flat" in property["standardised_address"].lower():
|
|
searcher.newest_epc = searcher.estimate_epc(
|
|
property_type="Flat",
|
|
built_form=None,
|
|
lmks_to_drop=None,
|
|
exclude_old=True
|
|
)
|
|
else:
|
|
searcher.newest_epc = searcher.estimate_epc(
|
|
property_type="House",
|
|
built_form=None,
|
|
lmks_to_drop=None,
|
|
exclude_old=True
|
|
)
|
|
|
|
epc_data_batch_2.append(
|
|
{
|
|
"internal_id": property["internal_id"],
|
|
**searcher.newest_epc
|
|
}
|
|
)
|
|
|
|
if searcher.older_epcs is not None:
|
|
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
|
|
# Store in S3
|
|
# TODO - read in instead of running
|
|
# save_pickle_to_s3(
|
|
# data=epc_data_batch_2,
|
|
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
|
# bucket_name="retrofit-data-dev"
|
|
# )
|
|
#
|
|
# save_pickle_to_s3(
|
|
# data=older_epcs_batch_2,
|
|
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
|
|
# bucket_name="retrofit-data-dev"
|
|
# )
|
|
|
|
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
|
|
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
|
|
|
|
# We now prepare the final data for clustering
|
|
uprn_filenames = read_dataframe_from_s3_parquet(
|
|
bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
|
|
)
|
|
|
|
uprn_map = {}
|
|
for uprn in complete_epcs["uprn"]:
|
|
filtered_df = uprn_filenames[
|
|
(uprn_filenames["lower"] <= int(uprn))
|
|
& (uprn_filenames["upper"] >= int(uprn))
|
|
]
|
|
if filtered_df["filenames"].values[0] in uprn_map:
|
|
uprn_map[filtered_df["filenames"].values[0]].append(int(uprn))
|
|
else:
|
|
uprn_map[filtered_df["filenames"].values[0]] = [int(uprn)]
|
|
|
|
spatial_data_to_uprn = []
|
|
for filename, associated_uprn in tqdm(uprn_map.items(), total=len(uprn_map)):
|
|
# Read in the file
|
|
spatial_data = read_dataframe_from_s3_parquet(
|
|
bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
|
|
)
|
|
|
|
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
|
|
spatial_data_to_uprn.append(spatial_df)
|
|
|
|
# TODO: Let's store this in s3
|
|
# save_pickle_to_s3(
|
|
# data=spatial_data_to_uprn,
|
|
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
|
# bucket_name="retrofit-data-dev"
|
|
# )
|
|
|
|
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
|
|
|
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
|
columns=["partition", "filename"]
|
|
).rename(columns={"UPRN": "uprn"})
|
|
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
|
|
|
|
property_attributes = complete_epcs.merge(
|
|
spatial_data_to_uprn,
|
|
how="inner",
|
|
on="uprn"
|
|
)
|
|
|
|
property_attributes = property_attributes.merge(
|
|
asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
|
|
)
|
|
|
|
# TODO: Add on data from the asset list such as ownership
|
|
|
|
# We drop the columns we don't care about for clustering
|
|
property_attributes = property_attributes.drop(
|
|
columns=[
|
|
"address",
|
|
"uprn-source",
|
|
"heating-cost-potential",
|
|
"hot-water-cost-potential",
|
|
"potential-energy-rating",
|
|
"environment-impact-potential",
|
|
"address3",
|
|
"local-authority-label",
|
|
"sheating-energy-eff",
|
|
"local-authority-label",
|
|
"county",
|
|
"postcode",
|
|
"constituency",
|
|
"co2-emissions-potential",
|
|
"energy-consumption-potential",
|
|
"local-authority",
|
|
"inspection-date",
|
|
"address1",
|
|
"constituency-label",
|
|
"building-reference-number",
|
|
"floor-energy-eff",
|
|
"address2",
|
|
"posttown",
|
|
"floor-env-eff",
|
|
"sheating-env-eff",
|
|
"lighting-cost-potential",
|
|
"main-heating-controls",
|
|
"transaction-type",
|
|
"uprn",
|
|
"lodgement-date",
|
|
"lmk-key",
|
|
"wind-turbine-count",
|
|
"tenure",
|
|
"potential-energy-efficiency",
|
|
"glazed-area"
|
|
]
|
|
)
|
|
|
|
# Fields to transform: lodgement-datetime
|
|
property_attributes["days_since_last_epc"] = (
|
|
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
|
|
).dt.days
|
|
|
|
property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
|
|
|
|
# Up to:
|
|
# Round averages to nearest integer
|
|
fill_with_average = [
|
|
"low-energy-fixed-light-count",
|
|
"floor-height",
|
|
"heating-cost-current",
|
|
"fixed-lighting-outlets-count",
|
|
"hot-water-cost-current",
|
|
"number-heated-rooms",
|
|
"co2-emiss-curr-per-floor-area",
|
|
"total-floor-area",
|
|
"environment-impact-current",
|
|
"co2-emissions-current",
|
|
"number-habitable-rooms",
|
|
"energy-consumption-current",
|
|
'lighting-cost-current',
|
|
"low-energy-lighting",
|
|
]
|
|
|
|
fill_with_mode = [
|
|
"multi-glaze-proportion",
|
|
"extension-count",
|
|
]
|
|
|
|
fill_with_zero = [
|
|
"unheated-corridor-length",
|
|
"number-open-fireplaces",
|
|
"photo-supply",
|
|
]
|
|
|
|
fill_with_categorical = {
|
|
"construction-age-band": "unknown",
|
|
"mainheat-energy-eff": "N/A",
|
|
"windows-env-eff": "N/A",
|
|
"lighting-energy-eff": "N/A",
|
|
"energy-tariff": 'NO DATA!',
|
|
"mechanical-ventilation": 'NO DATA!',
|
|
"solar-water-heating-flag": "N",
|
|
"mains-gas-flag": "N",
|
|
"heat-loss-corridor": "unknown",
|
|
"flat-storey-count": "Not a flat",
|
|
"roof-energy-eff": "N/A",
|
|
"hot-water-env-eff": "N/A",
|
|
"mainheatc-energy-eff": "N/A",
|
|
"main-fuel": 'NO DATA!',
|
|
"lighting-env-eff": "N/A",
|
|
"windows-energy-eff": "N/A",
|
|
"roof-env-eff": "N/A",
|
|
"walls-env-eff": "N/A",
|
|
"mainheat-env-eff": "N/A",
|
|
"flat-top-storey": "N",
|
|
"mainheatc-env-eff": "N",
|
|
"floor-level": "NODATA!",
|
|
"hot-water-energy-eff": "N/A",
|
|
"glazed-type": "unknown"
|
|
}
|
|
|
|
# Consolidation columns to single value
|
|
consolidation_columns = {
|
|
"glazed-type": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
|
|
"mechanical-ventilation": {"from": ['', 'NO DATA!', 'not defined', 'INVALID!'], "to": "unknown"},
|
|
"solar-water-heating-flag": {"from": [''], "to": "N"},
|
|
"mains-gas-flag": {"from": [''], "to": "N"},
|
|
"heat-loss-corridor": {"from": ['NO DATA!', ''], "to": "N"},
|
|
"flat-top-storey": {"from": [''], "to": "N"},
|
|
"floor-level": {"from": [""], "to": "NODATA!"}
|
|
}
|
|
|
|
# Perform the cleaning
|
|
for col in fill_with_average:
|
|
property_attributes[col] = property_attributes[col].replace('', None)
|
|
avg_val = np.mean([float(x) for x in property_attributes[col].values if x not in [None, "", np.nan]])
|
|
if pd.isnull(avg_val):
|
|
raise Exception("something went wrong")
|
|
property_attributes[col] = property_attributes[col].fillna(round(avg_val))
|
|
property_attributes[col] = property_attributes[col].astype(float)
|
|
|
|
for c in fill_with_zero:
|
|
property_attributes[c] = property_attributes[c].replace('', 0)
|
|
property_attributes[c] = property_attributes[c].fillna(0)
|
|
property_attributes[c] = property_attributes[c].astype(float)
|
|
|
|
for col in fill_with_mode:
|
|
property_attributes[col] = property_attributes[col].replace('', None)
|
|
mode_val = stats.mode([float(x) for x in property_attributes[col].values if x not in [None, "", np.nan]])[0]
|
|
if pd.isnull(mode_val):
|
|
raise Exception("something went wrong")
|
|
property_attributes[col] = property_attributes[col].fillna(mode_val)
|
|
property_attributes[col] = property_attributes[col].astype(float)
|
|
|
|
for c, fill_val in fill_with_categorical.items():
|
|
property_attributes[c] = property_attributes[c].replace('', fill_val)
|
|
property_attributes[c] = property_attributes[c].fillna(fill_val)
|
|
|
|
# Finally, consolidate
|
|
for c, consolidate_config in consolidation_columns.items():
|
|
for v in consolidate_config["from"]:
|
|
property_attributes[c] = property_attributes[c].replace(v, consolidate_config["to"])
|
|
|
|
property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
|
|
property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
|
|
property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
|
|
property_attributes["days_since_last_epc"].mean()
|
|
)
|
|
|
|
missings = pd.isnull(property_attributes).sum()
|
|
missings = missings[missings > 0]
|
|
|
|
# Save this
|
|
# save_pickle_to_s3(
|
|
# data=property_attributes,
|
|
# bucket_name="retrofit-data-dev",
|
|
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
|
# )
|
|
|
|
# from utils.s3 import read_pickle_from_s3
|
|
# property_attributes = read_pickle_from_s3(
|
|
# bucket_name="retrofit-data-dev",
|
|
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
|
# )
|
|
|
|
# We perform some additional cleaning on the data
|
|
import msgpack
|
|
cleaned = read_from_s3(
|
|
s3_file_name="cleaned_epc_data/cleaned.bson",
|
|
bucket_name="retrofit-data-dev"
|
|
)
|
|
|
|
cleaned = msgpack.unpackb(cleaned, raw=False)
|
|
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
|
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
|
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
|
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
|
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
|
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
|
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
|
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
|
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
|
|
|
cleaners = {
|
|
"floor-description": FloorAttributes,
|
|
'hotwater-description': HotWaterAttributes,
|
|
'main-fuel': MainFuelAttributes,
|
|
'mainheat-description': MainHeatAttributes,
|
|
'mainheatcont-description': MainheatControlAttributes,
|
|
'roof-description': RoofAttributes,
|
|
'walls-description': WallAttributes,
|
|
'windows-description': WindowAttributes,
|
|
'lighting-description': LightingAttributes
|
|
}
|
|
|
|
for variable_to_clean in cleaned.keys():
|
|
|
|
unique_descriptions = property_attributes[variable_to_clean].unique()
|
|
clean_df = pd.DataFrame(cleaned[variable_to_clean])
|
|
# Check if we have any
|
|
missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
|
|
if missed:
|
|
descriptions_to_append = []
|
|
for description in missed:
|
|
if variable_to_clean == "lighting-description":
|
|
cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
|
|
else:
|
|
cln = cleaners[variable_to_clean](description)
|
|
to_append = {
|
|
"original_description": description,
|
|
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
|
|
**cln.process()
|
|
}
|
|
descriptions_to_append.append(to_append)
|
|
|
|
descriptions_to_append = pd.DataFrame(descriptions_to_append)
|
|
clean_df = pd.concat([clean_df, descriptions_to_append])
|
|
|
|
clean_df = clean_df.rename(
|
|
columns={
|
|
"thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
|
|
"is_assumed": f"{variable_to_clean}_is_assumed",
|
|
}
|
|
)
|
|
|
|
if 'thermal_transmittance_unit' in clean_df.columns:
|
|
clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])
|
|
|
|
starting_size = len(property_attributes)
|
|
property_attributes = property_attributes.merge(
|
|
clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
|
|
)
|
|
if starting_size != property_attributes.shape[0]:
|
|
raise Exception("something went wrong")
|
|
property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
|
|
# Fill missings
|
|
for k in clean_df.columns:
|
|
if k in property_attributes.columns:
|
|
property_attributes[k] = property_attributes[k].fillna("missing")
|
|
|
|
# We group some variables such as thermal transmittance for walls, roof, floors
|
|
# ranges = {
|
|
# "< 0.1": (0, 0.1),
|
|
# "0.1 - 0.3": (0.1, 0.3),
|
|
# "0.3 - 0.5": (0.3, 0.5),
|
|
# "0.5 - 0.7": (0.5, 0.7),
|
|
# "0.9 - 1": (0.9, 1),
|
|
# "1 - 1.5": (1, 1.5),
|
|
# "1.5 - 2": (1.5, 2),
|
|
# "2+": (2, 2.5)
|
|
# }
|
|
|
|
ranges = {
|
|
"< 0.1": (0, 0.1),
|
|
"0.1 - 0.3": (0.1, 0.3),
|
|
"0.3 - 0.5": (0.3, 0.5),
|
|
"0.5+": (0.5, 2.5),
|
|
}
|
|
|
|
# Generate the lookup table
|
|
thermal_transmittance_lookup_table = []
|
|
for i in range(1, 251):
|
|
value = i / 100
|
|
for label, (low, high) in ranges.items():
|
|
if low < value <= high:
|
|
thermal_transmittance_lookup_table.append({"from": value, "to": label})
|
|
break
|
|
|
|
# Convert to DataFrame for display
|
|
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
|
|
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
|
|
|
|
thermal_transmittance_cols = [
|
|
c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
|
|
]
|
|
for i, col in enumerate(thermal_transmittance_cols):
|
|
# Perform the mapping
|
|
to_col = f"to_{col}"
|
|
property_attributes[col] = property_attributes[col].astype(str)
|
|
property_attributes = property_attributes.merge(
|
|
thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
|
|
how="left",
|
|
left_on=col,
|
|
right_on="from",
|
|
suffixes=("", f"_{i}")
|
|
)
|
|
property_attributes = property_attributes.drop(columns=["from", col])
|
|
property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
|
|
|
|
# Drop the description columns that are the keys in cleaned
|
|
print("PUT ME BACK!!??")
|
|
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
|
|
# Perform the mapping
|
|
|
|
# CLUSTERING!!
|
|
grouping_columns = [
|
|
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
|
|
]
|
|
|
|
additional_features = [
|
|
|
|
]
|
|
|
|
# Define the preprocessing for numerical and categorical features
|
|
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
|
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
|
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
|
|
|
|
for col in categorical_features:
|
|
property_attributes[col] = property_attributes[col].astype(str)
|
|
|
|
id_column = 'internal_id'
|
|
n_clusters = 450
|
|
random_state = 0
|
|
|
|
training_data_grouped = property_attributes.groupby(grouping_columns)
|
|
group_sizes = {name: len(group) for name, group in training_data_grouped}
|
|
total_size = sum(group_sizes.values())
|
|
cluster_allocation = {
|
|
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
|
|
}
|
|
|
|
# Adjust cluster allocation to ensure total clusters sum to 450
|
|
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
|
|
|
|
# TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
|
|
# collect the results of the clustering and then perform the transformations afterwards
|
|
|
|
final_clusters = []
|
|
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
|
|
|
|
group_n_clusters = cluster_allocation[group_variables]
|
|
group_data.set_index(id_column, inplace=True)
|
|
|
|
preprocessor = ColumnTransformer(
|
|
transformers=[
|
|
('num', StandardScaler(), numerical_features),
|
|
('cat', OneHotEncoder(), categorical_features)
|
|
]
|
|
)
|
|
|
|
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
|
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
|
|
|
|
# Fit the pipeline to the data
|
|
pipeline.fit(group_data)
|
|
|
|
# Transform the data using the fitted pipeline
|
|
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
|
|
|
|
# Get cluster labels
|
|
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
|
|
|
|
# Get centroids (already in the same transformed space)
|
|
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
|
|
|
# if the data isn't an array, make it one
|
|
if not isinstance(processed_data, np.ndarray):
|
|
processed_data = processed_data.toarray()
|
|
|
|
# Calculate distances from each point to the centroid of its cluster
|
|
distances_to_centroids = [
|
|
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
|
for i, label in enumerate(group_data['cluster'])
|
|
]
|
|
|
|
group_data['distance_to_centroid'] = distances_to_centroids
|
|
|
|
# for cluster_id in group_data['cluster'].unique():
|
|
# cluster_data = group_data[group_data['cluster'] == cluster_id]
|
|
# min_distance = cluster_data['distance_to_centroid'].min()
|
|
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
|
# if min_distance != 0:
|
|
# print(f"No point with zero distance found in cluster {cluster_id}")
|
|
|
|
# Ranking rows by distance within each cluster
|
|
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
|
|
|
# Sorting to verify
|
|
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
|
|
group_data.reset_index(inplace=True)
|
|
|
|
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
|
|
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
|
|
final_clusters.append(to_append)
|
|
|
|
final_clusters = pd.concat(final_clusters)
|
|
# remap the clusters from the current names to 1 -> n_clusters
|
|
|
|
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
|
|
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
|
|
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
|
|
|
|
################################################
|
|
# Prepare outputs!!!!
|
|
################################################
|
|
|
|
property_attributes.reset_index(inplace=True)
|
|
property_attributes = property_attributes.merge(
|
|
final_clusters, how="left", on="internal_id"
|
|
)
|
|
property_attributes["archetype_representative"] = property_attributes["rank"] == 1
|
|
|
|
asset_list_with_archetypes = asset_list.merge(
|
|
property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
|
|
on="internal_id"
|
|
)
|
|
|
|
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
|
|
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
|
|
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")
|
|
|
|
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
|
|
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
|
|
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
|
|
|
|
asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
|
|
"archetype_representative"].fillna(False)
|
|
|
|
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)
|
|
|
|
stonewater_uprn_lookup = asset_list_with_archetypes[
|
|
["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
|
|
]
|
|
|
|
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
|
|
|
|
|
|
def pull_ideal_postcodes(missing_uprn_with_udprn):
    """
    Look up every UDPRN in ``missing_uprn_with_udprn`` against the Ideal Postcodes API.

    One GET request is made per row, with a 0.5 second pause before each call.

    :param missing_uprn_with_udprn: DataFrame with a "udprn" column
    :return: list holding the "result" entry of each API response, in row order
    :raises ValueError: if any call returns a status code other than 200
    """
    api_key = ""  # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/
    import requests
    import time

    # Number of leading rows to skip; raise this to resume an interrupted run. The check is positional so that it
    # behaves the same whatever index labels the dataframe carries.
    completed_id = 0

    headers = {
        'Accept': 'application/json'
    }

    uprn_to_udprn = []
    rows = tqdm(missing_uprn_with_udprn.iterrows(), total=len(missing_uprn_with_udprn))
    for position, (_, data) in enumerate(rows):
        if position < completed_id:
            continue
        time.sleep(0.5)

        # Call the API
        udprn = data["udprn"]
        url = f"https://api.ideal-postcodes.co.uk/v1/udprn/{udprn}"

        # The key travels as a query parameter only (a GET request should not carry a body) and the timeout stops
        # a stalled connection from hanging the whole pull
        response = requests.get(url, params={"api_key": api_key}, headers=headers, timeout=30)
        if response.status_code != 200:
            raise ValueError("API call dead")

        uprn_to_udprn.append(response.json()["result"])

    # Previously the collected results were discarded when the function ended
    return uprn_to_udprn
|
|
|
|
|
|
def updated_version():
    """
    Archetype the Stonewater asset list using the simplified feature set.

    This version of the clustering factors in the updates received from Stonewater to simplify the archetyping
    process, using fewer variables and also factoring in their internal data sources.

    This work began on the 23rd July 2024

    The function:
      1) reads the asset list, UPRN lookups, wave 2.1 list, master sheet, EPC data and spatial data
      2) filters to priority postcodes that are not in wave 2.1, not already EPC C or above and have a UPRN
      3) builds the clustering features from the master sheet
      4) merges very small groups into their nearest larger group
      5) runs KMeans within each group and ranks properties by distance to their cluster centroid
      6) writes the asset list with archetypes and the archetyping features to CSV

    :return: None, the results are written to CSV files
    """

    ########################################################################
    # Read in data
    ########################################################################
    asset_list = read_asset_list()
    asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)

    # Read in the properties that have been included in Osmosis' wave 2.1
    osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()

    # A property is treated as being in wave 2.1 if either its asset id or its first address line appears in the
    # wave 2.1 data
    asset_list["In Osmosis Wave 2.1"] = (
        asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
        | asset_list["address1"].isin(osmosis_wave_2_1["Name"])
    )

    priority_postcodes, _previous_waves_address_id, master_sheet = read_stonewater_asset_data()

    # Pull in the EPC data
    epc_data = read_epc_data(uprn_lookup_2)

    # Pull in the spatial data to UPRN
    # NOTE(review): this key starts "scustomers/" while other reads in this file use "customers/" - confirm
    spatial_data_to_uprn = read_pickle_from_s3(
        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
        bucket_name="retrofit-data-dev"
    )

    # Function to convert specific columns to bool dtype
    def convert_specific_columns_to_bool(df, columns):
        # NOTE(review): astype(bool) turns missing values into True - confirm these columns are fully populated
        for column in columns:
            if column in df.columns:
                df[column] = df[column].astype(bool)
        return df

    spatial_data_to_uprn = [convert_specific_columns_to_bool(
        df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
    ) for df in spatial_data_to_uprn]

    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
    spatial_data_to_uprn = spatial_data_to_uprn.drop(
        columns=["partition", "filename"]
    ).rename(columns={"UPRN": "uprn"})
    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)

    ########################################################################
    # Prepare the data
    ########################################################################

    # Flag the properties that sit in the priority postcodes
    asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)

    # Copy so that adding a column below does not write into a slice of the original master sheet
    master_sheet = master_sheet[
        master_sheet["Address ID"].isin(
            asset_list["external_address_id"].values
        )
    ].copy()

    master_sheet["days_since_lodgement"] = (
        datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
    ).dt.days

    asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
        master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
        how="left",
        left_on="external_address_id",
        right_on="Address ID"
    )

    asset_list = asset_list.merge(
        epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
        how="left",
        on="internal_id"
    )
    asset_list["days_since_lodgement_epc"] = (
        datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
    ).dt.days

    # Flag properties that were surveyed within the last 5 years
    asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365

    # Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
    # a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
    # the EPC is done
    asset_list["is_epc_c_or_above"] = (
        ((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
        (asset_list["EPC Rating"] >= 80)
    )

    clustering_features = asset_list[
        asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
        ~pd.isnull(asset_list["uprn"])
    ][
        [
            "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
            "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
            "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
        ]
    ]

    # Merge on the SAP data
    clustering_features = clustering_features.merge(
        master_sheet[
            ["Address ID", "SAP"]
        ].rename(columns={"SAP": "parity_modelled_sap"}),
        how="left",
        left_on="external_address_id",
        right_on="Address ID"
    )

    # For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
    clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
    clustering_features["representative_sap"] = np.where(
        clustering_features["epc_within_5_years"],
        clustering_features["current-energy-efficiency"],
        clustering_features["parity_modelled_sap"]
    )

    # We remove the final three entries from postcode to give us postal region. Removing two gives us 415 values which
    # is too many
    clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]

    # Merge on spatial features
    clustering_features = clustering_features.merge(
        spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
        how="left",
        on="uprn"
    )

    # incorect_epcs = clustering_features[
    #     clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
    # incorect_epcs = incorect_epcs[
    #     ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
    # ]
    # incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
    # # Store data
    # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)

    # We add in the key features, which are used for clustering
    master_sheet_clustering_features = master_sheet[
        ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
    ].copy()

    # Step 1: Remap walls - we end up with 11 types
    master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
        {
            "TimberFrame: AsBuilt": "Other wall type, as built",
            "SystemBuilt: AsBuilt": "Other wall type, as built",
            "Sandstone: AsBuilt": "Other wall type, as built",
            "Sandstone: Internal": "Other wall type, internal or external",
            "SystemBuilt: External": "Other wall type, internal or external",
            "GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
            "TimberFrame: Internal": "Other wall type, internal or external",
            "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
            "SystemBuilt: Internal": "Other wall type, internal or external",
            "Cavity: Internal": "Other wall type, internal or external",
        }
    )

    # Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
    # gives us the insulation thickness

    # Clean an incorrect value
    master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
        {
            "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
            "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
            'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
            'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
            'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
        }
    )

    # Split on the first ":" in one vectorised pass. Values without a ":" get an empty thickness and missing values
    # stay missing rather than raising. Extra whitespace is stripped from both parts.
    roof_parts = master_sheet_clustering_features['Roofs'].str.partition(':')
    master_sheet_clustering_features['roof_type'] = roof_parts[0].str.strip()
    master_sheet_clustering_features['roof_insulation_thickness'] = roof_parts[2].str.strip()

    def map_thickness(thickness):
        # Missing values are passed through untouched
        if not isinstance(thickness, str):
            return thickness
        try:
            value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
        except ValueError:
            return thickness  # Return the original value if it cannot be converted to a float
        # Exactly 250mm falls in the "Below 250mm" category
        return "Above 250mm" if value > 250 else "Below 250mm"

    master_sheet_clustering_features['roof_insulation_category'] = (
        master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
    )

    # Ideas
    # 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
    # as a secondary category
    # 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
    # (could split on :, take first part)

    clustering_features = clustering_features.merge(
        master_sheet_clustering_features,
        how="left",
        on="Address ID"
    )

    # Reduce down to the final set of features we need. Copy so the column assignments below act on an independent
    # frame
    clustering_features = clustering_features[
        [
            "internal_id",
            "Property Type",
            # Location
            "postal_region",
            'conservation_status',
            'is_listed_building',
            'is_heritage_building',
            "county",
            # Walls
            "walls_reduced",
            # Roof
            "roof_type",
            "roof_insulation_category",
            # Heating
            "Heating",
            # Fuel
            "Main Fuel",
            "Age",
            "Total Floor Area",
            "representative_sap",
            "days_since_lodgement",
        ]
    ].copy()

    clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)

    def split_property_type(row):
        # "type: built form: extended feature" -> three separate values. Properties with no master sheet entry
        # have a missing value here, which is passed through as the property type.
        if not isinstance(row, str):
            return pd.Series([row, '', ''])
        parts = row.split(':')
        property_type = parts[0].strip()
        built_form = parts[1].strip() if len(parts) > 1 else ''
        property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
        return pd.Series([property_type, built_form, property_extended_feature])

    clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
        clustering_features['Property Type'].apply(split_property_type)
    )
    clustering_features = clustering_features.drop(columns=["Property Type"])

    # These are the variables we MUST split by
    grouping_columns = [
        "property_type",
        "walls_reduced",
        "roof_type",
        "Main Fuel",
        "county",
    ]

    def combine_small_groups(clustering_features, grouping_columns, threshold=2):
        """
        Merge every group with at most ``threshold`` rows into its nearest larger group.

        The nearest group is the group of the larger-group row that lies closest to any member of the small group.
        Rows of the small group then take that row's grouping values.
        """
        # Identify small groups
        group_sizes = clustering_features.groupby(grouping_columns).size()
        small_groups = group_sizes[group_sizes <= threshold].index.tolist()

        # Separate the small groups from the rest of clustering_features
        in_small_group = clustering_features.set_index(grouping_columns).index.isin(small_groups)
        small_group_data = clustering_features[in_small_group].copy()
        clustering_features_ok = clustering_features[~in_small_group]

        # Nothing to merge, or nothing to merge into
        if small_group_data.empty or clustering_features_ok.empty:
            return clustering_features

        # One-Hot Encode categorical variables
        categorical_features = (
            clustering_features_ok.drop(columns=["internal_id"])
            .select_dtypes(include=['object', 'category']).columns.tolist()
        )
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ohe.fit(clustering_features_ok[categorical_features])

        small_group_ohe = ohe.transform(small_group_data[categorical_features])
        large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])

        # The id is an identifier, not a property characteristic, so it must not influence the distance
        numerical_features = [
            column for column in
            clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns
            if column != "internal_id"
        ]
        small_group_numerical = small_group_data[numerical_features].values
        large_group_numerical = clustering_features_ok[numerical_features].values

        # Concatenate one-hot encoded categorical and numerical features
        small_group_features = np.hstack([small_group_ohe, small_group_numerical])
        large_group_features = np.hstack([large_group_ohe, large_group_numerical])

        # For every small-group ROW: position of, and distance to, the nearest larger-group row
        closest_rows, closest_distances = pairwise_distances_argmin_min(small_group_features, large_group_features)

        # Keys are captured before any reassignment so each row is still matched to its original small group
        small_group_keys = small_group_data.set_index(grouping_columns).index
        grouping_positions = [small_group_data.columns.get_loc(column) for column in grouping_columns]
        target_values = clustering_features_ok[grouping_columns].to_numpy()

        # Update small groups to the nearest large group. The nearest-neighbour results are per row, so each group
        # is resolved through its own member rows rather than by pairing the two sequences positionally.
        for small_group in small_groups:
            member_positions = np.flatnonzero(small_group_keys.isin([small_group]))
            best_member = member_positions[np.argmin(closest_distances[member_positions])]
            target = target_values[closest_rows[best_member]]
            small_group_data.iloc[member_positions, grouping_positions] = np.tile(
                target, (len(member_positions), 1)
            )

        combined_data = pd.concat([clustering_features_ok, small_group_data])
        return combined_data

    clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)

    ########################################################################
    # Clustering
    ########################################################################
    id_column = 'internal_id'
    n_clusters = 450
    random_state = 0

    # The id column is set as the index inside the loop, so it can never be used as a feature
    numerical_features = [
        column for column in
        clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns
        if column != id_column
    ]
    text_columns = [
        column for column in
        clustering_features_combined.select_dtypes(include=['object', 'category']).columns
        if column != id_column
    ]

    # Every text column (grouping columns included) is cast to str so that missing values form their own
    # "nan" group rather than being dropped by the groupby
    for col in text_columns:
        clustering_features_combined[col] = clustering_features_combined[col].astype(str)

    # Grouping columns are constant within a group, so they are left out of the encoded features
    categorical_features = [column for column in text_columns if column not in grouping_columns]

    training_data_grouped = clustering_features_combined.groupby(grouping_columns)
    group_sizes = {name: len(group) for name, group in training_data_grouped}
    total_size = sum(group_sizes.values())
    cluster_allocation = {
        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
    }

    # Adjust cluster allocation to ensure total clusters sum to 450
    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)

    final_clusters = []
    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):

        # KMeans cannot be fitted with more clusters than there are rows in the group
        group_n_clusters = min(cluster_allocation[group_variables], len(group_data))
        group_data = group_data.set_index(id_column)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(), categorical_features)
            ]
        )

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])

        # Fit the pipeline to the data
        pipeline.fit(group_data)

        # Transform the data using the fitted pipeline
        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)

        # Get cluster labels
        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_

        # Get centroids (already in the same transformed space)
        centroids = pipeline.named_steps['kmeans'].cluster_centers_

        # if the data isn't an array, make it one
        if not isinstance(processed_data, np.ndarray):
            processed_data = processed_data.toarray()

        # Calculate distances from each point to the centroid of its cluster
        distances_to_centroids = [
            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
            for i, label in enumerate(group_data['cluster'])
        ]

        group_data['distance_to_centroid'] = distances_to_centroids

        # Ranking rows by distance within each cluster
        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')

        # Sorting to verify
        group_data = group_data.sort_values(by=['cluster', 'rank']).reset_index()

        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
        # Cluster labels restart at 0 in every group, so the group key is appended to keep them unique
        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
        final_clusters.append(to_append)

    final_clusters = pd.concat(final_clusters)
    # remap the clusters from the current names to consecutive integers (starting at 0), stored as strings
    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
    final_clusters["cluster"] = final_clusters["cluster"].astype(str)

    assigned_clusters = clustering_features_combined.merge(
        final_clusters, how="left", on="internal_id"
    )

    # The property closest to its cluster centroid represents the archetype
    assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1

    asset_list_with_archetypes = asset_list.merge(
        assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
        on="internal_id"
    ).merge(
        master_sheet_clustering_features[["Address ID", "Property Type", "Walls", "Roofs", "Heating"]],
        how="left",
        on="Address ID"
    )

    # We populate the reasons for no archetype. Later reasons overwrite earlier ones.
    # 1) If it's not a priority postcode
    asset_list_with_archetypes["cluster"] = np.where(
        ~asset_list_with_archetypes["is_priority_postcode"],
        "NOT PRIORITY POSTCODE",
        asset_list_with_archetypes["cluster"]
    )

    # 2) If it's EPC C or above
    asset_list_with_archetypes["cluster"] = np.where(
        asset_list_with_archetypes["is_epc_c_or_above"],
        "EPC C OR ABOVE",
        asset_list_with_archetypes["cluster"]
    )

    # 3) If it's in Wave 2.1
    asset_list_with_archetypes["cluster"] = np.where(
        asset_list_with_archetypes["In Osmosis Wave 2.1"],
        "IN WAVE 2.1",
        asset_list_with_archetypes["cluster"]
    )

    # 4) Has missing uprn
    asset_list_with_archetypes["cluster"] = np.where(
        pd.isnull(asset_list_with_archetypes["uprn"]),
        "MISSING UPRN",
        asset_list_with_archetypes["cluster"]
    )

    # -999 is a temporary placeholder so that the rank can be cast to int before being labelled
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")

    asset_list_with_archetypes["archetype_representative"] = (
        asset_list_with_archetypes["archetype_representative"].fillna(False)
    )

    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.1.csv", index=False)

    # Produce the archetyping features
    archetyping_features_csv = assigned_clusters[
        [
            "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
            "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
        ]
    ].merge(
        asset_list[
            ["internal_id", "uprn", "external_address_id"]
        ],
        how="left",
        on="internal_id"
    ).merge(
        master_sheet_clustering_features,
        how="left",
        right_on="Address ID",
        left_on="external_address_id"
    ).drop(columns=["Address ID"]).rename(
        columns={
            "internal_id": "Osm. ID",
            "external_address_id": "Address ID",
        }
    )

    archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
    archetyping_features_csv.to_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
    )

    # Report how widely the archetype representatives are spread geographically
    representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
    print(representatives["postal_region"].nunique())
    print(representatives["county"].nunique())
|
|
|
|
|
|
def read_asset_list():
    """
    Load the Stonewater triage asset list and attach the UDPRN of every property.

    Only assets that appear in the UDPRN workbook are kept (inner join on "Address ID"). The customer column
    names are mapped onto the internal naming and a comma separated "full_address" column is added.

    :return: DataFrame of assets
    """
    triage = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
        header=4
    )

    udprn_lookup = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
    )
    udprn_lookup = udprn_lookup[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
    # Nullable integer first, so the UDPRN strings carry no trailing ".0"
    udprn_lookup["UDPRN"] = udprn_lookup["UDPRN"].astype("Int64").astype(str)
    udprn_lookup["Address ID"] = udprn_lookup["Address ID"].astype(str)

    internal_names = {
        "UDPRN": "udprn",
        "Osm. ID": "internal_id",
        "Org. ref.": "customer_asset_id",
        "Postcode": "postcode",
        "House no": "house_number",
        "Name": "address1",
        "Address line 2": "address2",
        "City/Town": "city_town",
        "County": "county",
        "Address ID": "external_address_id",
        "Owning body": "owner"
    }
    asset_list = triage.merge(udprn_lookup, how="inner", on="Address ID").rename(columns=internal_names)

    # The address is "address1, [address2, ]City, postcode"; the second line is only included when present
    first_line = asset_list["address1"] + ", "
    town_and_postcode = asset_list["city_town"].str.title() + ", " + asset_list["postcode"]
    address_with_second_line = first_line + asset_list["address2"] + ", " + town_and_postcode
    address_without_second_line = first_line + town_and_postcode

    asset_list["full_address"] = np.where(
        asset_list["address2"].notna(),
        address_with_second_line,
        address_without_second_line
    )
    return asset_list
|
|
|
|
|
|
def merge_uprn_to_asset_list(asset_list):
    """
    Attach a UPRN, standardised address and match type to every asset.

    Four lookups are combined: two JSON address lookups, the Ideal Postcodes pull (joined to the assets through
    the UDPRN) and the manually fixed UPRNs. The combined lookup is inner joined to the asset list on
    internal_id and external_address_id.

    :param asset_list: DataFrame of assets with "udprn", "internal_id" and "external_address_id" columns
    :return: tuple of (asset list with the lookup columns merged on, the EPC based lookup)
    """
    bucket = "retrofit-data-dev"

    def load_json_frame(s3_file_name):
        # Each lookup is stored in S3 as a JSON document that maps straight onto a dataframe
        return pd.DataFrame(json.loads(read_from_s3(bucket_name=bucket, s3_file_name=s3_file_name)))

    # NOTE(review): the first two keys start "scustomers/" while the third starts "customers/" - confirm
    # Lookup 1: exact address matches
    uprn_lookup_1 = load_json_frame("scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json")
    uprn_lookup_1["match_type"] = "Exact"

    # Lookup 2: matches taken from the EPC records
    uprn_lookup_2 = load_json_frame("scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json").rename(
        columns={
            "epc_address": "standardised_address",
            "epc_postcode": "standardised_postcode"
        }
    )
    uprn_lookup_2["match_type"] = "EPC"
    # Manual correction of the UPRN for one property
    # NOTE(review): the map at the top of the file assigns this UPRN to internal_id 1092, not 1091 - confirm
    is_corrected_property = uprn_lookup_2["internal_id"] == 1091
    uprn_lookup_2["uprn"] = np.where(is_corrected_property, 83143766, uprn_lookup_2["uprn"])

    # Lookup 3: the Ideal Postcodes pull, which has no internal ids yet
    uprn_lookup_3 = load_json_frame("customers/Stonewater/clustering/ideal-postcodes_pull_2.json")
    address_parts = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]]
    uprn_lookup_3["standardised_address"] = address_parts.apply(concatenate_row, axis=1)
    uprn_lookup_3 = uprn_lookup_3[["udprn", "uprn", "standardised_address", "postcode"]]
    uprn_lookup_3 = uprn_lookup_3.rename(columns={"postcode": "standardised_postcode"})
    uprn_lookup_3["match_type"] = "Exact"

    # Lookup 4: manually reviewed matches, where "option" records which candidate was accepted
    manual_fixes = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
    manual_fixes["os_option_1_uprn"] = manual_fixes["os_option_1_uprn"].astype(str)
    manual_fixes["os_option_2_uprn"] = manual_fixes["os_option_2_uprn"].astype("Int64").astype(str)

    def resolve_manual_fix(fix):
        chosen = fix["option"]
        if chosen == 1:
            uprn = fix["os_option_1_uprn"]
            address = fix["os_option_1_address"]
            postcode = fix["os_option_1_postcode"]
        elif chosen == 2:
            uprn = fix["os_option_2_uprn"]
            address = fix["os_option_2_address"]
            # The postcode of the second option is taken from the final part of its address
            postcode = address.split(", ")[-1]
        else:
            # Neither candidate was accepted, so the manually entered values are used
            uprn = fix["manual_uprn"]
            address = fix["manual_address"]
            postcode = fix["manual_postcode"]
        return {
            "internal_id": fix["internal_id"],
            "external_address_id": fix["external_address_id"],
            "uprn": uprn,
            "standardised_address": address,
            "standardised_postcode": postcode,
            "property_type": None,
            "built_form": None
        }

    uprn_lookup_4 = pd.DataFrame([resolve_manual_fix(fix) for _, fix in manual_fixes.iterrows()])
    uprn_lookup_4["match_type"] = "Fuzzy"

    # Between them the four lookups must account for every asset exactly once
    lookup_rows = len(uprn_lookup_1) + len(uprn_lookup_2) + len(uprn_lookup_3) + len(uprn_lookup_4)
    assert lookup_rows == len(asset_list)

    # Lookup 3 picks up its ids from the asset list through the UDPRN
    uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
    asset_ids = asset_list[["udprn", "internal_id", "external_address_id"]]
    uprn_lookup_3 = uprn_lookup_3.merge(asset_ids, how="left", on="udprn")

    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2, uprn_lookup_3, uprn_lookup_4])
    uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)

    # The asset list already carries the UDPRN, so the lookup's copy is dropped before joining
    asset_list = asset_list.merge(
        uprn_lookup.drop(columns=["udprn"]),
        how="inner",
        on=["internal_id", "external_address_id"]
    )

    return asset_list, uprn_lookup_2
|
|
|
|
|
|
def read_omosis_wave_2_1():
    """
    Read the properties that are part of Osmosis' wave 2.1.

    Rows whose "Location" is "Removed from program" are excluded.

    :return: tuple of (list of integer asset ids, the filtered wave 2.1 DataFrame)
    """
    osmosis_wave_2_1 = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
        header=4,
    )
    # Remove double spaces from "Name": any run of two or more spaces collapses to a single space
    osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace(r" {2,}", " ", regex=True)

    osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
    osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]

    # We produce a cleaned list of asset ids from osmosis_wave_2_1
    osmosis_wave_2_1_asset_ids = []
    for raw_id in osmosis_wave_2_1["Asset ID"].dropna():
        if isinstance(raw_id, str):
            # We have some ids that are in the form 'id1, id2' so we split them. Empty parts, such as the one
            # left by a trailing comma, are skipped.
            stripped_parts = (part.strip() for part in raw_id.split(","))
            osmosis_wave_2_1_asset_ids.extend(int(part) for part in stripped_parts if part)
        else:
            # A cell holding a single id may be read from the workbook as a number rather than text
            osmosis_wave_2_1_asset_ids.append(int(raw_id))

    return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
|
|
|
|
|
|
def read_stonewater_asset_data():
    """
    Read the master sheet and the priority postcodes from the reviewed Parity download.

    :return: tuple of (list of priority postcodes, list of address ids flagged as being in a wave 2.1
        programme, the master sheet DataFrame with "Address ID" as a string)
    """
    master_sheet = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
        "sheet.csv",
        encoding='latin1'
    )
    master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)

    # A property has been in a previous wave when any of the wave 2.1 columns says "Yes"
    wave_columns = ["In Osmosis W2.1", "In Wates Wave 2.1", "In Liv Green Wave 2.1", "In CCS Wave 2.1"]
    in_previous_wave = master_sheet[wave_columns].eq("Yes").any(axis=1)
    previous_waves_address_id = master_sheet.loc[in_previous_wave, "Address ID"].dropna().astype(str).tolist()

    # We also read the priority postcodes
    priority_postcode_sheet = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
        "postcodes.csv",
        header=17
    )
    priority_postcodes = priority_postcode_sheet["Postcode"].tolist()

    return priority_postcodes, previous_waves_address_id, master_sheet
|
|
|
|
|
|
def read_epc_data(uprn_lookup_2):
    """
    Read both batches of EPC data from S3 and stack them into a single DataFrame.

    :param uprn_lookup_2: the EPC based UPRN lookup; first batch rows whose internal_id is not in it are dropped
    :return: DataFrame holding the filtered first batch followed by the second batch
    """
    bucket = "retrofit-data-dev"

    first_batch = pd.DataFrame(
        json.loads(read_from_s3(bucket_name=bucket, s3_file_name="customers/Stonewater/clustering/epc_data.json"))
    )

    # Manual correction of the UPRN for one property
    needs_correction = first_batch["internal_id"] == 1091
    first_batch["uprn"] = np.where(needs_correction, 83143766, first_batch["uprn"])

    # We drop some EPCs: only those with an entry in the lookup are kept
    in_lookup = first_batch["internal_id"].isin(uprn_lookup_2["internal_id"].values)
    first_batch = first_batch[in_lookup]

    second_batch = pd.DataFrame(
        read_pickle_from_s3(
            s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
            bucket_name=bucket
        )
    )

    return pd.concat([first_batch, second_batch])
|