Merge pull request #323 from Hestia-Homes/solar-api

Solar api
This commit is contained in:
KhalimCK 2024-07-23 20:11:05 +01:00 committed by GitHub
commit 1dafedc733
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 894 additions and 101 deletions

View file

@ -133,7 +133,7 @@ def app():
energy_consumption_data = []
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
# Skip the first 50
if i < 127:
if i < 250:
continue
data = pd.read_csv(directory / "certificates.csv", low_memory=False)

View file

@ -1 +1 @@
[{"EPC": "D", "count": 1718}, {"EPC": "C", "count": 1343}, {"EPC": "E", "count": 538}, {"EPC": "F", "count": 80}, {"EPC": "B", "count": 52}, {"EPC": "G", "count": 3}, {"EPC": "A", "count": 2}]
[{"EPC": "D", "count": 332}, {"EPC": "C", "count": 68}, {"EPC": "E", "count": 44}, {"EPC": "F", "count": 6}]

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
[{"is_real_epc": true, "count": 3736}, {"is_real_epc": false, "count": 1509}]
[{"index": true, "is_real_epc": 3736}, {"index": false, "is_real_epc": 1509}]

View file

@ -31,7 +31,8 @@ def make_epc_rating_piechart(epc_rating_breakdown):
labels = [x["EPC"] for x in epc_rating_breakdown]
values = [x["count"] for x in epc_rating_breakdown]
marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
# marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
marker_colors = ["#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
fig = go.Figure(
data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
@ -53,7 +54,10 @@ def make_map(locations):
# Create custom hover text
df['hover_text'] = df.apply(
lambda row: f"UPRN: {int(row['uprn'])}<br>Address: {row['standardised_address']}<br>Postcode: "
f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}",
f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: "
f"{row['LONGITUDE']}<br>Walls: {row['Walls']}<br>Roofs: {row['Roofs']}<br>Main Fuel: "
f"{row['Main Fuel']}<br>Heating: {row['Heating']}<br>Age: {row['Age']}<br>Property Type: "
f"{row['Property Type']}",
axis=1)
data = [
@ -93,8 +97,8 @@ def layout():
locations = json.load(file)
# Get the EPC breakdown data
with open("Stonewater real EPC breakdown.json") as file:
real_epc_breakdown = json.load(file)
# with open("Stonewater real EPC breakdown.json") as file:
# real_epc_breakdown = json.load(file)
# Get the EPC ratings data
with open("Stonewater EPC rating breakdown.json") as file:
@ -149,7 +153,8 @@ def layout():
style={"font-size": "2.5rem", "font-weight": "bold", "margin-bottom": "20px"}
),
html.P(
"This map shows the location of the properties that are to be surveyed by Osmosis.",
"This map shows the location of the properties that are to be surveyed by Osmosis. "
"These properties span across 30 counties and 155 postal regions",
style={"font-size": "1.25rem", "margin-bottom": "40px"}
),
],
@ -170,22 +175,22 @@ def layout():
),
dbc.Row(
[
dbc.Col(
[
html.Div(
"Breakdown of real EPCs",
style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
className='text-center'
),
html.Div(
"This pie chart shows the proportion of real EPCs in the asset list. Currently, "
"there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
style={"marginBottom": "1em"}
),
make_real_epc_piechart(real_epc_breakdown),
],
width={"size": 5},
),
# dbc.Col(
# [
# html.Div(
# "Breakdown of real EPCs",
# style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
# className='text-center'
# ),
# html.Div(
# "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
# "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
# style={"marginBottom": "1em"}
# ),
# make_real_epc_piechart(real_epc_breakdown),
# ],
# width={"size": 5},
# ),
dbc.Col(
[
html.Div(
@ -195,22 +200,9 @@ def layout():
),
html.Div(
[
"This pie chart shows the breakdown of EPC ratings, for properties that currently "
"have an EPC. "
"The ratings range from A to G, where surprisingly, there are two EPC properties "
"that were initially "
"expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
" seen ",
html.A("here",
href="https://find-energy-certificate.service.gov.uk/energy-certificate"
"/2708-5001-7327-6090-7284",
target="_blank"),
" and ",
html.A("here",
href="https://find-energy-certificate.service.gov.uk/energy-certificate"
"/1037-4032-1009-0361-7292",
target="_blank"),
"."
"This pie chart shows the breakdown of expected and real EPC ratings, "
"for properties "
"that have been selected for sample",
],
style={"marginBottom": "1em"}
),

View file

@ -11,8 +11,9 @@ In this script, we do the following things:
import pandas as pd
import json
from utils.s3 import read_pickle_from_s3
from backend.app.utils import sap_to_epc
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
archetyped_asset_list = stonewater_asset_list[
[
"internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
# Read in and merge on clustering features
clustering_features = read_pickle_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
clustering_features = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
)
# Move property-type and built-form to the first two columns
columns_to_move = ['property-type', 'built-form']
# Get the remaining columns
remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
# Create the new column order
new_column_order = columns_to_move + remaining_columns
# Reorder the DataFrame
clustering_features = clustering_features[new_column_order]
archetyped_asset_list = archetyped_asset_list.merge(
clustering_features,
on="internal_id",
how="inner"
)
clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
left_on="internal_id",
right_on="Osm. ID"
).drop(columns=["Osm. ID"])
archetyped_asset_list = archetyped_asset_list.rename(
columns={
@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
# We store the location data, which will be used for the mapping. We just need the longitude and latitude
stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
mapping_data = stonewater_asset_list[
stonewater_asset_list["archetype_representative"]
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
how="left",
on="uprn"
)
# We need to merge on longitude and latitude
spatial_data_to_uprn = read_pickle_from_s3(
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
bucket_name="retrofit-data-dev"
)
# Function to convert specific columns to bool dtype
def convert_specific_columns_to_bool(df, columns):
for column in columns:
if column in df.columns:
df[column] = df[column].astype(bool)
return df
spatial_data_to_uprn = [convert_specific_columns_to_bool(
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
) for df in spatial_data_to_uprn]
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
spatial_data_to_uprn = spatial_data_to_uprn.drop(
columns=["partition", "filename"]
).rename(columns={"UPRN": "uprn"})
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")
mapping_data = mapping_data.merge(
clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
spatial_data_to_uprn[
["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
],
how="left",
on="uprn"
)
mapping_data = mapping_data.drop(columns=["internal_id"])
@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
f.write(json.dumps(mapping_data.to_dict(orient="records")))
# We also include some data for visualising the breakdown of EPCS
proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
# Invert the true and false
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
proportion_of_real_epcs = proportion_of_real_epcs.rename(
columns={"estimated": "is_real_epc"}
)
# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
# proportion_of_real_epcs = proportion_of_real_epcs.rename(
# columns={"estimated": "is_real_epc"}
# )
#
# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
# f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
# Produce the breakdown of EPC ratings for properties to be surveyed
clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)
# Produce the breakdown of EPC ratings
epc_rating_breakdown = (
clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
.value_counts()
.to_frame()
.reset_index()
)
epc_rating_breakdown = epc_rating_breakdown.rename(
columns={"current-energy-rating": "EPC"}
columns={"index": "EPC", "representative_epc": "count"}
)
with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
epc_a_properties = clustering_features[
(clustering_features["current-energy-rating"] == "A")
& (~clustering_features["estimated"])
]
epc_a_properties = epc_a_properties.merge(
stonewater_asset_list,
on="internal_id",
how="inner"
)

View file

@ -13,12 +13,13 @@ import numpy as np
import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
save_dataframe_to_s3_parquet, save_pickle_to_s3
save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances_argmin_min
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -1083,11 +1084,11 @@ def compile_data():
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
# TODO: Let's store this in s3
save_data_to_s3(
data=json.dumps(spatial_data_to_uprn.to_dict("records")),
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
bucket_name="retrofit-data-dev"
)
# save_data_to_s3(
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
# bucket_name="retrofit-data-dev"
# )
# We merge this spatial data onto final EPCS
@ -1429,17 +1430,17 @@ def compile_data_final():
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
# Store in S3
# TODO - read in instead of running
save_pickle_to_s3(
data=epc_data_batch_2,
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
save_pickle_to_s3(
data=older_epcs_batch_2,
s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
# save_pickle_to_s3(
# data=epc_data_batch_2,
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
# bucket_name="retrofit-data-dev"
# )
#
# save_pickle_to_s3(
# data=older_epcs_batch_2,
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
# bucket_name="retrofit-data-dev"
# )
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@ -1799,6 +1800,10 @@ def compile_data_final():
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
]
additional_features = [
]
# Define the preprocessing for numerical and categorical features
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
@ -1957,3 +1962,706 @@ def pull_ideal_postcodes(missing_uprn_with_udprn):
result["result"]
)
completed_id += 1
def updated_version():
"""
This version of the clustering factors in the updates recieved from Stonewater to simplify the archetyping process
using fewer variables and also factoring in their internal data sources
This work began on the 23rd July 2024
:return:
"""
########################################################################
# Read in data
########################################################################
asset_list = read_asset_list()
asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)
# Read in the properties that have been included in Osmosis' wave 2.1
osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
# We also check the address & postcode
asset_list["In Osmosis Wave 2.1"] = np.where(
asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
True,
asset_list["In Osmosis Wave 2.1"]
)
priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
# Pull in the EPC data
epc_data = read_epc_data(uprn_lookup_2)
# Pull in the spatial data to UPRN
spatial_data_to_uprn = read_pickle_from_s3(
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
bucket_name="retrofit-data-dev"
)
# Function to convert specific columns to bool dtype
def convert_specific_columns_to_bool(df, columns):
for column in columns:
if column in df.columns:
df[column] = df[column].astype(bool)
return df
spatial_data_to_uprn = [convert_specific_columns_to_bool(
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
) for df in spatial_data_to_uprn]
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
spatial_data_to_uprn = spatial_data_to_uprn.drop(
columns=["partition", "filename"]
).rename(columns={"UPRN": "uprn"})
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
########################################################################
# Prepare the data
########################################################################
# Filter the asset list down to the priority postcodes
asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
master_sheet = master_sheet[
master_sheet["Address ID"].isin(
asset_list["external_address_id"].values
)
]
master_sheet["days_since_lodgement"] = (
datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
).dt.days
asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
how="left",
left_on="external_address_id",
right_on="Address ID"
)
asset_list = asset_list.merge(
epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
how="left",
on="internal_id"
)
asset_list["days_since_lodgement_epc"] = (
datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
).dt.days
# Flag properties that were surveyed within the last 5 years
asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365
# Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
# a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
# the EPC is done
asset_list["is_epc_c_or_above"] = (
((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
(asset_list["EPC Rating"] >= 80)
)
clustering_features = asset_list[
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
~pd.isnull(asset_list["uprn"])
][
[
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
"city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
]
]
# Merge on the SAP data
clustering_features = clustering_features.merge(
master_sheet[
["Address ID", "SAP"]
].rename(columns={"SAP": "parity_modelled_sap"}),
how="left",
left_on="external_address_id",
right_on="Address ID"
)
# For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
clustering_features["representative_sap"] = np.where(
clustering_features["epc_within_5_years"],
clustering_features["current-energy-efficiency"],
clustering_features["parity_modelled_sap"]
)
# We remove the final three entries from postcode to give us postal region. Removing two gives us 415 values which
# is too many
clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
# Merge on spatial features
clustering_features = clustering_features.merge(
spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
how="left",
on="uprn"
)
# incorect_epcs = clustering_features[
# clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
# incorect_epcs = incorect_epcs[
# ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
# ]
# incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
# # Store data
# incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
# We add in the key features, which are used for clustering
master_sheet_clustering_features = master_sheet[
["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
].copy()
# Step 1: Remap walls - we end up with 11 types
master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
{
"TimberFrame: AsBuilt": "Other wall type, as built",
"SystemBuilt: AsBuilt": "Other wall type, as built",
"Sandstone: AsBuilt": "Other wall type, as built",
"Sandstone: Internal": "Other wall type, internal or external",
"SystemBuilt: External": "Other wall type, internal or external",
"GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
"TimberFrame: Internal": "Other wall type, internal or external",
"Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
"SystemBuilt: Internal": "Other wall type, internal or external",
"Cavity: Internal": "Other wall type, internal or external",
}
)
# Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
# gives us the insulation thickness
# Clean an incorrect value
master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
{
"PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
"PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
}
)
master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
master_sheet_clustering_features['Roofs'].apply(
lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
)
)
# Strip any extra whitespace
master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
master_sheet_clustering_features['roof_insulation_thickness'] = (
master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
)
def map_thickness(thickness):
try:
value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
return "Above 250mm" if value > 250 else "Below 250mm"
except ValueError:
return thickness # Return the original value if it cannot be converted to a float
master_sheet_clustering_features['roof_insulation_category'] = (
master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
)
# Ideas
# 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
# as a secondary category
# 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
# (could split on :, take first part)
clustering_features = clustering_features.merge(
master_sheet_clustering_features,
how="left",
on="Address ID"
)
# Reduce down to the final set of features we need
clustering_features = clustering_features[
[
"internal_id",
"Property Type",
# Location
"postal_region",
'conservation_status',
'is_listed_building',
'is_heritage_building',
"county",
# Walls
"walls_reduced",
# Roof
"roof_type",
"roof_insulation_category",
# Heating
"Heating",
# Fuel
"Main Fuel",
"Age",
"Total Floor Area",
"representative_sap",
"days_since_lodgement",
]
]
clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
def split_property_type(row):
parts = row.split(':')
property_type = parts[0].strip()
built_form = parts[1].strip() if len(parts) > 1 else ''
property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
return pd.Series([property_type, built_form, property_extended_feature])
clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
clustering_features['Property Type'].apply(split_property_type)
)
clustering_features = clustering_features.drop(columns=["Property Type"])
# These are the variables we MUST split by
grouping_columns = [
"property_type",
"walls_reduced",
"roof_type",
"Main Fuel",
"county",
]
def combine_small_groups(clustering_features, grouping_columns, threshold=2):
# Identify small groups
group_sizes = clustering_features.groupby(grouping_columns).size()
small_groups = group_sizes[group_sizes <= threshold].index.tolist()
# Remove small groups from the original clustering_features
small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
clustering_features_ok = clustering_features[
~clustering_features.set_index(grouping_columns).index.isin(small_groups)
]
if small_group_data.empty:
return clustering_features
# One-Hot Encode categorical variables
categorical_features = (
clustering_features_ok.drop(columns=["internal_id"])
.select_dtypes(include=['object', 'category']).columns.tolist()
)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(clustering_features_ok[categorical_features])
# Combine small groups with the nearest available group
small_group_ohe = ohe.transform(small_group_data[categorical_features])
large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])
numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
small_group_numerical = small_group_data[numerical_features].values
large_group_numerical = clustering_features_ok[numerical_features].values
# Concatenate one-hot encoded categorical and numerical features
small_group_features = np.hstack([small_group_ohe, small_group_numerical])
large_group_features = np.hstack([large_group_ohe, large_group_numerical])
# Calculate distances and find nearest groups
closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features)
closest_group_index = clustering_features_ok.iloc[closest_groups].index
# Update small groups to the nearest large group
for small_group, closest_group in zip(small_groups, closest_group_index):
small_group_mask = small_group_data.set_index(grouping_columns).index == small_group
small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[
closest_group, grouping_columns].values
combined_data = pd.concat([clustering_features_ok, small_group_data])
return combined_data
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
########################################################################
# Clustering
########################################################################
numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
for col in categorical_features:
clustering_features_combined[col] = clustering_features_combined[col].astype(str)
id_column = 'internal_id'
n_clusters = 450
random_state = 0
training_data_grouped = clustering_features_combined.groupby(grouping_columns)
group_sizes = {name: len(group) for name, group in training_data_grouped}
total_size = sum(group_sizes.values())
cluster_allocation = {
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
}
# Adjust cluster allocation to ensure total clusters sum to 450
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
final_clusters = []
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
group_n_clusters = cluster_allocation[group_variables]
group_data.set_index(id_column, inplace=True)
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(), categorical_features)
]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
# Fit the pipeline to the data
pipeline.fit(group_data)
# Transform the data using the fitted pipeline
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
# Get cluster labels
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
# Get centroids (already in the same transformed space)
centroids = pipeline.named_steps['kmeans'].cluster_centers_
# if the data isn't an array, make it one
if not isinstance(processed_data, np.ndarray):
processed_data = processed_data.toarray()
# Calculate distances from each point to the centroid of its cluster
distances_to_centroids = [
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
for i, label in enumerate(group_data['cluster'])
]
group_data['distance_to_centroid'] = distances_to_centroids
# Ranking rows by distance within each cluster
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
# Sorting to verify
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
group_data.reset_index(inplace=True)
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
final_clusters.append(to_append)
final_clusters = pd.concat(final_clusters)
# remap the clusters from the current names to 1 -> n_clusters
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
assigned_clusters = clustering_features_combined.merge(
final_clusters, how="left", on="internal_id"
)
assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
asset_list_with_archetypes = asset_list.merge(
assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
on="internal_id"
)
# We populate the reasons for no archetype
# 1) If it's not a priority postcode
asset_list_with_archetypes["cluster"] = np.where(
~asset_list_with_archetypes["is_priority_postcode"],
"NOT PRIORITY POSTCODE",
asset_list_with_archetypes["cluster"]
)
# 2) If it's EPC C or above
asset_list_with_archetypes["cluster"] = np.where(
asset_list_with_archetypes["is_epc_c_or_above"],
"EPC C OR ABOVE",
asset_list_with_archetypes["cluster"]
)
# If it's in Wave 2.1
asset_list_with_archetypes["cluster"] = np.where(
asset_list_with_archetypes["In Osmosis Wave 2.1"],
"IN WAVE 2.1",
asset_list_with_archetypes["cluster"]
)
# Has missing uprn
asset_list_with_archetypes["cluster"] = np.where(
pd.isnull(asset_list_with_archetypes["uprn"]),
"MISSING UPRN",
asset_list_with_archetypes["cluster"]
)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
asset_list_with_archetypes["archetype_representative"] = (
asset_list_with_archetypes["archetype_representative"].fillna(False)
)
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False)
# Produce the archetyping features
archetyping_features_csv = assigned_clusters[
[
"internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
"is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
]
].merge(
asset_list[
["internal_id", "uprn", "external_address_id"]
],
how="left",
on="internal_id"
).merge(
master_sheet_clustering_features,
how="left",
right_on="Address ID",
left_on="external_address_id"
).drop(columns=["Address ID"]).rename(
columns={
"internal_id": "Osm. ID",
"external_address_id": "Address ID",
}
)
archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
archetyping_features_csv.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
)
representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
print(representatives["postal_region"].nunique())
print(representatives["county"].nunique())
def read_asset_list():
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
header=4
)
udprn_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
asset_list = asset_list.rename(
columns={
"Osm. ID": "internal_id",
"Org. ref.": "customer_asset_id",
"Postcode": "postcode",
"House no": "house_number",
"Name": "address1",
"Address line 2": "address2",
"City/Town": "city_town",
"County": "county",
"Address ID": "external_address_id",
"Owning body": "owner"
}
)
asset_list["full_address"] = np.where(
~pd.isnull(asset_list["address2"]),
(
asset_list["address1"] + ", " +
asset_list["address2"] + ", " +
asset_list["city_town"].str.title() + ", " +
asset_list["postcode"]
),
asset_list["address1"] + ", " +
asset_list["city_town"].str.title() + ", " +
asset_list["postcode"]
)
return asset_list
def merge_uprn_to_asset_list(asset_list):
# Read in the lookups
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
)))
uprn_lookup_1["match_type"] = "Exact"
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
)))
uprn_lookup_2 = uprn_lookup_2.rename(
columns={
"epc_address": "standardised_address",
"epc_postcode": "standardised_postcode"
}
)
uprn_lookup_2["match_type"] = "EPC"
uprn_lookup_2["uprn"] = np.where(
uprn_lookup_2["internal_id"] == 1091,
83143766,
uprn_lookup_2["uprn"]
)
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
)))
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
concatenate_row, axis=1
)
uprn_lookup_3 = uprn_lookup_3[
["udprn", "uprn", "standardised_address", "postcode"]
].rename(columns={"postcode": "standardised_postcode"})
uprn_lookup_3["match_type"] = "Exact"
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
# prepare lookup 4
uprn_lookup_4 = []
for _, x in uprn_lookup_4_basis.iterrows():
property_type = None
built_form = None
if x["option"] == 1:
uprn = x["os_option_1_uprn"]
standardised_address = x["os_option_1_address"]
postcode = x["os_option_1_postcode"]
elif x["option"] == 2:
uprn = x["os_option_2_uprn"]
standardised_address = x["os_option_2_address"]
postcode = x["os_option_2_address"].split(", ")[-1]
else:
uprn = x["manual_uprn"]
standardised_address = x["manual_address"]
postcode = x["manual_postcode"]
uprn_lookup_4.append(
{
"internal_id": x["internal_id"],
"external_address_id": x["external_address_id"],
"uprn": uprn,
"standardised_address": standardised_address,
"standardised_postcode": postcode,
"property_type": property_type,
"built_form": built_form
}
)
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
uprn_lookup_4["match_type"] = "Fuzzy"
# concat
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
# Final preps of lookups
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
uprn_lookup_3 = uprn_lookup_3.merge(
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
)
uprn_lookup = pd.concat([
uprn_lookup,
uprn_lookup_3,
uprn_lookup_4
])
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
asset_list = asset_list.merge(
uprn_lookup.drop(columns=["udprn"]),
how="inner",
on=["internal_id", "external_address_id"]
)
return asset_list, uprn_lookup_2
def read_omosis_wave_2_1():
osmosis_wave_2_1 = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
header=4,
)
# Remove double spaces from "Name"
osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace(" ", " ")
osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]
# We produce a cleaned list of asset ids from osmosis_wave_2_1
osmosis_wave_2_1_asset_ids = [x for x in osmosis_wave_2_1["Asset ID"].values if not pd.isnull(x)]
# We have some ids that are in the form 'id1, id2' so we split them
osmosis_wave_2_1_asset_ids = [int(x.strip()) for id_str in osmosis_wave_2_1_asset_ids for x in id_str.split(",")]
return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
def read_stonewater_asset_data():
master_sheet = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
"sheet.csv",
encoding='latin1'
)
master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)
previous_waves = master_sheet[
(master_sheet["In Osmosis W2.1"] == "Yes") |
(master_sheet["In Wates Wave 2.1"] == "Yes") |
(master_sheet["In Liv Green Wave 2.1"] == "Yes") |
(master_sheet["In CCS Wave 2.1"] == "Yes")
].copy()
previous_waves_address_id = [str(x) for x in previous_waves["Address ID"].values if not pd.isnull(x)]
# We also read the priority postcodes
priority_postcodes = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
"postcodes.csv",
header=17
)
priority_postcodes = priority_postcodes["Postcode"].tolist()
return priority_postcodes, previous_waves_address_id, master_sheet
def read_epc_data(uprn_lookup_2):
epc_data = json.loads(
read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/epc_data.json"
)
)
epc_data = pd.DataFrame(epc_data)
epc_data["uprn"] = np.where(
epc_data["internal_id"] == 1091,
83143766,
epc_data["uprn"]
)
# We drop come EPCS
epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]
epc_data_batch_2 = read_pickle_from_s3(
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
return complete_epcs

View file

@ -0,0 +1,80 @@
import os
from tqdm import tqdm
import pandas as pd
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from backend.app.utils import sap_to_epc
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def app():
"""
This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score
:return:
"""
source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
source_file["row_id"] = source_file.index
# Split out the town, which is the final portion of the string, separated by commas
source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
source_file["Address"] = source_file["Address"].apply(
lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
)
unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
# for each postcode, pull EPC data
collected_data = []
no_data_found = []
no_data_after_filters = []
for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
searcher = SearchEpc(
postcode=config["Postcode"],
address1=address1,
auth_token=EPC_AUTH_TOKEN,
os_api_key=""
)
while True:
params = {
"postcode": config["Postcode"],
"address": address1,
}
results = searcher.client.domestic.search(params=params, size=10000)
if not results:
# We strip back address1
address1 = " ".join(address1.split(" ")[:-1])
if not address1:
break
else:
break
if not results:
no_data_found.append(config)
continue
data = pd.DataFrame(results["rows"])
data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
# Take EPCs post 2023
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
data = data[data["lodgement-date"] >= "2023-01-01"]
# Take private nrentals
data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
if data.empty:
no_data_after_filters.append(config)
continue
agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
agg.insert(0, "Postcode", config["Postcode"])
agg.insert(0, "Address", address1)
collected_data.append(agg)
collected_df = pd.concat(collected_data)
collected_df.to_csv("EPC Averages SFR.csv", index=False)