mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
commit
1dafedc733
8 changed files with 894 additions and 101 deletions
|
|
@ -133,7 +133,7 @@ def app():
|
|||
energy_consumption_data = []
|
||||
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
|
||||
# Skip the first 50
|
||||
if i < 127:
|
||||
if i < 250:
|
||||
continue
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
[{"EPC": "D", "count": 1718}, {"EPC": "C", "count": 1343}, {"EPC": "E", "count": 538}, {"EPC": "F", "count": 80}, {"EPC": "B", "count": 52}, {"EPC": "G", "count": 3}, {"EPC": "A", "count": 2}]
|
||||
[{"EPC": "D", "count": 332}, {"EPC": "C", "count": 68}, {"EPC": "E", "count": 44}, {"EPC": "F", "count": 6}]
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -1 +1 @@
|
|||
[{"is_real_epc": true, "count": 3736}, {"is_real_epc": false, "count": 1509}]
|
||||
[{"index": true, "is_real_epc": 3736}, {"index": false, "is_real_epc": 1509}]
|
||||
|
|
@ -31,7 +31,8 @@ def make_epc_rating_piechart(epc_rating_breakdown):
|
|||
labels = [x["EPC"] for x in epc_rating_breakdown]
|
||||
values = [x["count"] for x in epc_rating_breakdown]
|
||||
|
||||
marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
|
||||
# marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
|
||||
marker_colors = ["#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
|
||||
|
||||
fig = go.Figure(
|
||||
data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
|
||||
|
|
@ -53,7 +54,10 @@ def make_map(locations):
|
|||
# Create custom hover text
|
||||
df['hover_text'] = df.apply(
|
||||
lambda row: f"UPRN: {int(row['uprn'])}<br>Address: {row['standardised_address']}<br>Postcode: "
|
||||
f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}",
|
||||
f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: "
|
||||
f"{row['LONGITUDE']}<br>Walls: {row['Walls']}<br>Roofs: {row['Roofs']}<br>Main Fuel: "
|
||||
f"{row['Main Fuel']}<br>Heating: {row['Heating']}<br>Age: {row['Age']}<br>Property Type: "
|
||||
f"{row['Property Type']}",
|
||||
axis=1)
|
||||
|
||||
data = [
|
||||
|
|
@ -93,8 +97,8 @@ def layout():
|
|||
locations = json.load(file)
|
||||
|
||||
# Get the EPC breakdown data
|
||||
with open("Stonewater real EPC breakdown.json") as file:
|
||||
real_epc_breakdown = json.load(file)
|
||||
# with open("Stonewater real EPC breakdown.json") as file:
|
||||
# real_epc_breakdown = json.load(file)
|
||||
|
||||
# Get the EPC ratings data
|
||||
with open("Stonewater EPC rating breakdown.json") as file:
|
||||
|
|
@ -149,7 +153,8 @@ def layout():
|
|||
style={"font-size": "2.5rem", "font-weight": "bold", "margin-bottom": "20px"}
|
||||
),
|
||||
html.P(
|
||||
"This map shows the location of the properties that are to be surveyed by Osmosis.",
|
||||
"This map shows the location of the properties that are to be surveyed by Osmosis. "
|
||||
"These properties span across 30 counties and 155 postal regions",
|
||||
style={"font-size": "1.25rem", "margin-bottom": "40px"}
|
||||
),
|
||||
],
|
||||
|
|
@ -170,22 +175,22 @@ def layout():
|
|||
),
|
||||
dbc.Row(
|
||||
[
|
||||
dbc.Col(
|
||||
[
|
||||
html.Div(
|
||||
"Breakdown of real EPCs",
|
||||
style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
|
||||
className='text-center'
|
||||
),
|
||||
html.Div(
|
||||
"This pie chart shows the proportion of real EPCs in the asset list. Currently, "
|
||||
"there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
|
||||
style={"marginBottom": "1em"}
|
||||
),
|
||||
make_real_epc_piechart(real_epc_breakdown),
|
||||
],
|
||||
width={"size": 5},
|
||||
),
|
||||
# dbc.Col(
|
||||
# [
|
||||
# html.Div(
|
||||
# "Breakdown of real EPCs",
|
||||
# style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
|
||||
# className='text-center'
|
||||
# ),
|
||||
# html.Div(
|
||||
# "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
|
||||
# "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
|
||||
# style={"marginBottom": "1em"}
|
||||
# ),
|
||||
# make_real_epc_piechart(real_epc_breakdown),
|
||||
# ],
|
||||
# width={"size": 5},
|
||||
# ),
|
||||
dbc.Col(
|
||||
[
|
||||
html.Div(
|
||||
|
|
@ -195,22 +200,9 @@ def layout():
|
|||
),
|
||||
html.Div(
|
||||
[
|
||||
"This pie chart shows the breakdown of EPC ratings, for properties that currently "
|
||||
"have an EPC. "
|
||||
"The ratings range from A to G, where surprisingly, there are two EPC properties "
|
||||
"that were initially "
|
||||
"expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
|
||||
" seen ",
|
||||
html.A("here",
|
||||
href="https://find-energy-certificate.service.gov.uk/energy-certificate"
|
||||
"/2708-5001-7327-6090-7284",
|
||||
target="_blank"),
|
||||
" and ",
|
||||
html.A("here",
|
||||
href="https://find-energy-certificate.service.gov.uk/energy-certificate"
|
||||
"/1037-4032-1009-0361-7292",
|
||||
target="_blank"),
|
||||
"."
|
||||
"This pie chart shows the breakdown of expected and real EPC ratings, "
|
||||
"for properties "
|
||||
"that have been selected for sample",
|
||||
],
|
||||
style={"marginBottom": "1em"}
|
||||
),
|
||||
|
|
|
|||
|
|
@ -11,8 +11,9 @@ In this script, we do the following things:
|
|||
import pandas as pd
|
||||
import json
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
from backend.app.utils import sap_to_epc
|
||||
|
||||
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
|
||||
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
|
||||
archetyped_asset_list = stonewater_asset_list[
|
||||
[
|
||||
"internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
|
||||
|
|
@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
|
|||
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
|
||||
|
||||
# Read in and merge on clustering features
|
||||
clustering_features = read_pickle_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
clustering_features = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
|
||||
)
|
||||
|
||||
# Move property-type and built-form to the first two columns
|
||||
columns_to_move = ['property-type', 'built-form']
|
||||
|
||||
# Get the remaining columns
|
||||
remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
|
||||
|
||||
# Create the new column order
|
||||
new_column_order = columns_to_move + remaining_columns
|
||||
|
||||
# Reorder the DataFrame
|
||||
clustering_features = clustering_features[new_column_order]
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.merge(
|
||||
clustering_features,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
|
||||
left_on="internal_id",
|
||||
right_on="Osm. ID"
|
||||
).drop(columns=["Osm. ID"])
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.rename(
|
||||
columns={
|
||||
|
|
@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
|
|||
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
|
||||
|
||||
# We store the location data, which will be used for the mapping. We just need the longitude and latitude
|
||||
stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
|
||||
|
||||
mapping_data = stonewater_asset_list[
|
||||
stonewater_asset_list["archetype_representative"]
|
||||
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
|
||||
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
|
||||
archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
# We need to merge on longitude and latitude
|
||||
spatial_data_to_uprn = read_pickle_from_s3(
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
|
||||
# Function to convert specific columns to bool dtype
|
||||
def convert_specific_columns_to_bool(df, columns):
|
||||
for column in columns:
|
||||
if column in df.columns:
|
||||
df[column] = df[column].astype(bool)
|
||||
return df
|
||||
|
||||
|
||||
spatial_data_to_uprn = [convert_specific_columns_to_bool(
|
||||
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
|
||||
) for df in spatial_data_to_uprn]
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
||||
columns=["partition", "filename"]
|
||||
).rename(columns={"UPRN": "uprn"})
|
||||
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")
|
||||
|
||||
mapping_data = mapping_data.merge(
|
||||
clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
|
||||
spatial_data_to_uprn[
|
||||
["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
|
||||
],
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
mapping_data = mapping_data.drop(columns=["internal_id"])
|
||||
|
||||
|
|
@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
|
|||
f.write(json.dumps(mapping_data.to_dict(orient="records")))
|
||||
|
||||
# We also include some data for visualising the breakdown of EPCS
|
||||
proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
|
||||
# Invert the true and false
|
||||
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
|
||||
proportion_of_real_epcs = proportion_of_real_epcs.rename(
|
||||
columns={"estimated": "is_real_epc"}
|
||||
)
|
||||
# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
|
||||
# proportion_of_real_epcs = proportion_of_real_epcs.rename(
|
||||
# columns={"estimated": "is_real_epc"}
|
||||
# )
|
||||
#
|
||||
# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
|
||||
# f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
|
||||
f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
|
||||
# Produce the breakdown of EPC ratings for properties to be surveyed
|
||||
clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)
|
||||
|
||||
# Produce the breakdown of EPC ratings
|
||||
epc_rating_breakdown = (
|
||||
clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
|
||||
clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
|
||||
.value_counts()
|
||||
.to_frame()
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
epc_rating_breakdown = epc_rating_breakdown.rename(
|
||||
columns={"current-energy-rating": "EPC"}
|
||||
columns={"index": "EPC", "representative_epc": "count"}
|
||||
)
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
|
||||
f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
|
||||
|
||||
epc_a_properties = clustering_features[
|
||||
(clustering_features["current-energy-rating"] == "A")
|
||||
& (~clustering_features["estimated"])
|
||||
]
|
||||
|
||||
epc_a_properties = epc_a_properties.merge(
|
||||
stonewater_asset_list,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -13,12 +13,13 @@ import numpy as np
|
|||
import pandas as pd
|
||||
import time
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
|
||||
save_dataframe_to_s3_parquet, save_pickle_to_s3
|
||||
save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from scipy.spatial.distance import cdist
|
||||
from sklearn.metrics import pairwise_distances_argmin_min
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
|
@ -1083,11 +1084,11 @@ def compile_data():
|
|||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
|
||||
# TODO: Let's store this in s3
|
||||
save_data_to_s3(
|
||||
data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
||||
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
# We merge this spatial data onto final EPCS
|
||||
|
||||
|
|
@ -1429,17 +1430,17 @@ def compile_data_final():
|
|||
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
|
||||
# Store in S3
|
||||
# TODO - read in instead of running
|
||||
save_pickle_to_s3(
|
||||
data=epc_data_batch_2,
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_pickle_to_s3(
|
||||
data=older_epcs_batch_2,
|
||||
s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
# save_pickle_to_s3(
|
||||
# data=epc_data_batch_2,
|
||||
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
#
|
||||
# save_pickle_to_s3(
|
||||
# data=older_epcs_batch_2,
|
||||
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
|
||||
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
|
||||
|
|
@ -1799,6 +1800,10 @@ def compile_data_final():
|
|||
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
|
||||
]
|
||||
|
||||
additional_features = [
|
||||
|
||||
]
|
||||
|
||||
# Define the preprocessing for numerical and categorical features
|
||||
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
|
|
@ -1957,3 +1962,706 @@ def pull_ideal_postcodes(missing_uprn_with_udprn):
|
|||
result["result"]
|
||||
)
|
||||
completed_id += 1
|
||||
|
||||
|
||||
def updated_version():
|
||||
"""
|
||||
This version of the clustering factors in the updates recieved from Stonewater to simplify the archetyping process
|
||||
using fewer variables and also factoring in their internal data sources
|
||||
|
||||
This work began on the 23rd July 2024
|
||||
:return:
|
||||
"""
|
||||
|
||||
########################################################################
|
||||
# Read in data
|
||||
########################################################################
|
||||
asset_list = read_asset_list()
|
||||
asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)
|
||||
|
||||
# Read in the properties that have been included in Osmosis' wave 2.1
|
||||
osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
|
||||
|
||||
asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
|
||||
|
||||
# We also check the address & postcode
|
||||
asset_list["In Osmosis Wave 2.1"] = np.where(
|
||||
asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
|
||||
True,
|
||||
asset_list["In Osmosis Wave 2.1"]
|
||||
)
|
||||
|
||||
priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
|
||||
|
||||
# Pull in the EPC data
|
||||
epc_data = read_epc_data(uprn_lookup_2)
|
||||
|
||||
# Pull in the spatial data to UPRN
|
||||
spatial_data_to_uprn = read_pickle_from_s3(
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
# Function to convert specific columns to bool dtype
|
||||
def convert_specific_columns_to_bool(df, columns):
|
||||
for column in columns:
|
||||
if column in df.columns:
|
||||
df[column] = df[column].astype(bool)
|
||||
return df
|
||||
|
||||
spatial_data_to_uprn = [convert_specific_columns_to_bool(
|
||||
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
|
||||
) for df in spatial_data_to_uprn]
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
||||
columns=["partition", "filename"]
|
||||
).rename(columns={"UPRN": "uprn"})
|
||||
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
|
||||
|
||||
########################################################################
|
||||
# Prepare the data
|
||||
########################################################################
|
||||
|
||||
# Filter the asset list down to the priority postcodes
|
||||
asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
|
||||
|
||||
master_sheet = master_sheet[
|
||||
master_sheet["Address ID"].isin(
|
||||
asset_list["external_address_id"].values
|
||||
)
|
||||
]
|
||||
|
||||
master_sheet["days_since_lodgement"] = (
|
||||
datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
|
||||
).dt.days
|
||||
|
||||
asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
|
||||
master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
|
||||
how="left",
|
||||
left_on="external_address_id",
|
||||
right_on="Address ID"
|
||||
)
|
||||
|
||||
asset_list = asset_list.merge(
|
||||
epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
|
||||
how="left",
|
||||
on="internal_id"
|
||||
)
|
||||
asset_list["days_since_lodgement_epc"] = (
|
||||
datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
|
||||
).dt.days
|
||||
|
||||
# Flag properties that were surveyed within the last 5 years
|
||||
asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365
|
||||
|
||||
# Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
|
||||
# a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
|
||||
# the EPC is done
|
||||
asset_list["is_epc_c_or_above"] = (
|
||||
((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
|
||||
(asset_list["EPC Rating"] >= 80)
|
||||
)
|
||||
|
||||
clustering_features = asset_list[
|
||||
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
|
||||
~pd.isnull(asset_list["uprn"])
|
||||
][
|
||||
[
|
||||
"internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
|
||||
"city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
|
||||
"epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
|
||||
]
|
||||
]
|
||||
|
||||
# Merge on the SAP data
|
||||
clustering_features = clustering_features.merge(
|
||||
master_sheet[
|
||||
["Address ID", "SAP"]
|
||||
].rename(columns={"SAP": "parity_modelled_sap"}),
|
||||
how="left",
|
||||
left_on="external_address_id",
|
||||
right_on="Address ID"
|
||||
)
|
||||
|
||||
# For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
|
||||
clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
|
||||
clustering_features["representative_sap"] = np.where(
|
||||
clustering_features["epc_within_5_years"],
|
||||
clustering_features["current-energy-efficiency"],
|
||||
clustering_features["parity_modelled_sap"]
|
||||
)
|
||||
|
||||
# We remove the final three entries from postcode to give us postal region. Removing two gives us 415 values which
|
||||
# is too many
|
||||
clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
|
||||
|
||||
# Merge on spatial features
|
||||
clustering_features = clustering_features.merge(
|
||||
spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
# incorect_epcs = clustering_features[
|
||||
# clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
|
||||
# incorect_epcs = incorect_epcs[
|
||||
# ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
|
||||
# ]
|
||||
# incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
|
||||
# # Store data
|
||||
# incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
|
||||
|
||||
# We add in the key features, which are used for clustering
|
||||
master_sheet_clustering_features = master_sheet[
|
||||
["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
|
||||
].copy()
|
||||
|
||||
# Step 1: Remap walls - we end up with 11 types
|
||||
master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
|
||||
{
|
||||
"TimberFrame: AsBuilt": "Other wall type, as built",
|
||||
"SystemBuilt: AsBuilt": "Other wall type, as built",
|
||||
"Sandstone: AsBuilt": "Other wall type, as built",
|
||||
"Sandstone: Internal": "Other wall type, internal or external",
|
||||
"SystemBuilt: External": "Other wall type, internal or external",
|
||||
"GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
|
||||
"TimberFrame: Internal": "Other wall type, internal or external",
|
||||
"Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
|
||||
"SystemBuilt: Internal": "Other wall type, internal or external",
|
||||
"Cavity: Internal": "Other wall type, internal or external",
|
||||
}
|
||||
)
|
||||
|
||||
# Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
|
||||
# gives us the insulation thickness
|
||||
|
||||
# Clean an incorrect value
|
||||
master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
|
||||
{
|
||||
"PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
|
||||
"PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
|
||||
'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
|
||||
'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
|
||||
'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
|
||||
}
|
||||
)
|
||||
|
||||
master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
|
||||
master_sheet_clustering_features['Roofs'].apply(
|
||||
lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
|
||||
)
|
||||
)
|
||||
|
||||
# Strip any extra whitespace
|
||||
master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
|
||||
master_sheet_clustering_features['roof_insulation_thickness'] = (
|
||||
master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
|
||||
)
|
||||
|
||||
def map_thickness(thickness):
|
||||
try:
|
||||
value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
|
||||
return "Above 250mm" if value > 250 else "Below 250mm"
|
||||
except ValueError:
|
||||
return thickness # Return the original value if it cannot be converted to a float
|
||||
|
||||
master_sheet_clustering_features['roof_insulation_category'] = (
|
||||
master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
|
||||
)
|
||||
|
||||
# Ideas
|
||||
# 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
|
||||
# as a secondary category
|
||||
# 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
|
||||
# (could split on :, take first part)
|
||||
|
||||
clustering_features = clustering_features.merge(
|
||||
master_sheet_clustering_features,
|
||||
how="left",
|
||||
on="Address ID"
|
||||
)
|
||||
|
||||
# Reduce down to the final set of features we need
|
||||
clustering_features = clustering_features[
|
||||
[
|
||||
"internal_id",
|
||||
"Property Type",
|
||||
# Location
|
||||
"postal_region",
|
||||
'conservation_status',
|
||||
'is_listed_building',
|
||||
'is_heritage_building',
|
||||
"county",
|
||||
# Walls
|
||||
"walls_reduced",
|
||||
# Roof
|
||||
"roof_type",
|
||||
"roof_insulation_category",
|
||||
# Heating
|
||||
"Heating",
|
||||
# Fuel
|
||||
"Main Fuel",
|
||||
"Age",
|
||||
"Total Floor Area",
|
||||
"representative_sap",
|
||||
"days_since_lodgement",
|
||||
]
|
||||
]
|
||||
|
||||
clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
|
||||
|
||||
def split_property_type(row):
|
||||
parts = row.split(':')
|
||||
property_type = parts[0].strip()
|
||||
built_form = parts[1].strip() if len(parts) > 1 else ''
|
||||
property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
|
||||
return pd.Series([property_type, built_form, property_extended_feature])
|
||||
|
||||
clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
|
||||
clustering_features['Property Type'].apply(split_property_type)
|
||||
)
|
||||
clustering_features = clustering_features.drop(columns=["Property Type"])
|
||||
|
||||
# These are the variables we MUST split by
|
||||
grouping_columns = [
|
||||
"property_type",
|
||||
"walls_reduced",
|
||||
"roof_type",
|
||||
"Main Fuel",
|
||||
"county",
|
||||
]
|
||||
|
||||
def combine_small_groups(clustering_features, grouping_columns, threshold=2):
|
||||
# Identify small groups
|
||||
group_sizes = clustering_features.groupby(grouping_columns).size()
|
||||
small_groups = group_sizes[group_sizes <= threshold].index.tolist()
|
||||
|
||||
# Remove small groups from the original clustering_features
|
||||
small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
|
||||
clustering_features_ok = clustering_features[
|
||||
~clustering_features.set_index(grouping_columns).index.isin(small_groups)
|
||||
]
|
||||
|
||||
if small_group_data.empty:
|
||||
return clustering_features
|
||||
|
||||
# One-Hot Encode categorical variables
|
||||
categorical_features = (
|
||||
clustering_features_ok.drop(columns=["internal_id"])
|
||||
.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
)
|
||||
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
|
||||
ohe.fit(clustering_features_ok[categorical_features])
|
||||
|
||||
# Combine small groups with the nearest available group
|
||||
small_group_ohe = ohe.transform(small_group_data[categorical_features])
|
||||
large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])
|
||||
|
||||
numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
small_group_numerical = small_group_data[numerical_features].values
|
||||
large_group_numerical = clustering_features_ok[numerical_features].values
|
||||
|
||||
# Concatenate one-hot encoded categorical and numerical features
|
||||
small_group_features = np.hstack([small_group_ohe, small_group_numerical])
|
||||
large_group_features = np.hstack([large_group_ohe, large_group_numerical])
|
||||
|
||||
# Calculate distances and find nearest groups
|
||||
closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features)
|
||||
closest_group_index = clustering_features_ok.iloc[closest_groups].index
|
||||
|
||||
# Update small groups to the nearest large group
|
||||
for small_group, closest_group in zip(small_groups, closest_group_index):
|
||||
small_group_mask = small_group_data.set_index(grouping_columns).index == small_group
|
||||
small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[
|
||||
closest_group, grouping_columns].values
|
||||
|
||||
combined_data = pd.concat([clustering_features_ok, small_group_data])
|
||||
return combined_data
|
||||
|
||||
clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
|
||||
|
||||
########################################################################
|
||||
# Clustering
|
||||
########################################################################
|
||||
numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
|
||||
|
||||
for col in categorical_features:
|
||||
clustering_features_combined[col] = clustering_features_combined[col].astype(str)
|
||||
|
||||
id_column = 'internal_id'
|
||||
n_clusters = 450
|
||||
random_state = 0
|
||||
|
||||
training_data_grouped = clustering_features_combined.groupby(grouping_columns)
|
||||
group_sizes = {name: len(group) for name, group in training_data_grouped}
|
||||
total_size = sum(group_sizes.values())
|
||||
cluster_allocation = {
|
||||
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
|
||||
}
|
||||
|
||||
# Adjust cluster allocation to ensure total clusters sum to 450
|
||||
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
|
||||
|
||||
final_clusters = []
|
||||
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
|
||||
|
||||
group_n_clusters = cluster_allocation[group_variables]
|
||||
group_data.set_index(id_column, inplace=True)
|
||||
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', StandardScaler(), numerical_features),
|
||||
('cat', OneHotEncoder(), categorical_features)
|
||||
]
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(group_data)
|
||||
|
||||
# Transform the data using the fitted pipeline
|
||||
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
|
||||
|
||||
# Get cluster labels
|
||||
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
|
||||
|
||||
# Get centroids (already in the same transformed space)
|
||||
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
||||
|
||||
# if the data isn't an array, make it one
|
||||
if not isinstance(processed_data, np.ndarray):
|
||||
processed_data = processed_data.toarray()
|
||||
|
||||
# Calculate distances from each point to the centroid of its cluster
|
||||
distances_to_centroids = [
|
||||
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
||||
for i, label in enumerate(group_data['cluster'])
|
||||
]
|
||||
|
||||
group_data['distance_to_centroid'] = distances_to_centroids
|
||||
|
||||
# Ranking rows by distance within each cluster
|
||||
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
||||
|
||||
# Sorting to verify
|
||||
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
group_data.reset_index(inplace=True)
|
||||
|
||||
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
|
||||
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
|
||||
final_clusters.append(to_append)
|
||||
|
||||
final_clusters = pd.concat(final_clusters)
|
||||
# remap the clusters from the current names to 1 -> n_clusters
|
||||
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
|
||||
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
|
||||
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
|
||||
|
||||
assigned_clusters = clustering_features_combined.merge(
|
||||
final_clusters, how="left", on="internal_id"
|
||||
)
|
||||
|
||||
assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
|
||||
|
||||
asset_list_with_archetypes = asset_list.merge(
|
||||
assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
|
||||
on="internal_id"
|
||||
)
|
||||
|
||||
# We populate the reasons for no archetype
|
||||
# 1) If it's not a priority postcode
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
~asset_list_with_archetypes["is_priority_postcode"],
|
||||
"NOT PRIORITY POSTCODE",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
# 2) If it's EPC C or above
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
asset_list_with_archetypes["is_epc_c_or_above"],
|
||||
"EPC C OR ABOVE",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
# If it's in Wave 2.1
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
asset_list_with_archetypes["In Osmosis Wave 2.1"],
|
||||
"IN WAVE 2.1",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
# Has missing uprn
|
||||
asset_list_with_archetypes["cluster"] = np.where(
|
||||
pd.isnull(asset_list_with_archetypes["uprn"]),
|
||||
"MISSING UPRN",
|
||||
asset_list_with_archetypes["cluster"]
|
||||
)
|
||||
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
|
||||
|
||||
asset_list_with_archetypes["archetype_representative"] = (
|
||||
asset_list_with_archetypes["archetype_representative"].fillna(False)
|
||||
)
|
||||
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False)
|
||||
|
||||
# Produce the archetyping features
|
||||
archetyping_features_csv = assigned_clusters[
|
||||
[
|
||||
"internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
|
||||
"is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
|
||||
]
|
||||
].merge(
|
||||
asset_list[
|
||||
["internal_id", "uprn", "external_address_id"]
|
||||
],
|
||||
how="left",
|
||||
on="internal_id"
|
||||
).merge(
|
||||
master_sheet_clustering_features,
|
||||
how="left",
|
||||
right_on="Address ID",
|
||||
left_on="external_address_id"
|
||||
).drop(columns=["Address ID"]).rename(
|
||||
columns={
|
||||
"internal_id": "Osm. ID",
|
||||
"external_address_id": "Address ID",
|
||||
}
|
||||
)
|
||||
|
||||
archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
|
||||
archetyping_features_csv.to_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
|
||||
)
|
||||
|
||||
representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
|
||||
print(representatives["postal_region"].nunique())
|
||||
print(representatives["county"].nunique())
|
||||
|
||||
|
||||
def read_asset_list():
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
|
||||
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
|
||||
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
|
||||
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
|
||||
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
|
||||
|
||||
asset_list = asset_list.rename(
|
||||
columns={
|
||||
"Osm. ID": "internal_id",
|
||||
"Org. ref.": "customer_asset_id",
|
||||
"Postcode": "postcode",
|
||||
"House no": "house_number",
|
||||
"Name": "address1",
|
||||
"Address line 2": "address2",
|
||||
"City/Town": "city_town",
|
||||
"County": "county",
|
||||
"Address ID": "external_address_id",
|
||||
"Owning body": "owner"
|
||||
}
|
||||
)
|
||||
|
||||
asset_list["full_address"] = np.where(
|
||||
~pd.isnull(asset_list["address2"]),
|
||||
(
|
||||
asset_list["address1"] + ", " +
|
||||
asset_list["address2"] + ", " +
|
||||
asset_list["city_town"].str.title() + ", " +
|
||||
asset_list["postcode"]
|
||||
),
|
||||
asset_list["address1"] + ", " +
|
||||
asset_list["city_town"].str.title() + ", " +
|
||||
asset_list["postcode"]
|
||||
)
|
||||
return asset_list
|
||||
|
||||
|
||||
def merge_uprn_to_asset_list(asset_list):
|
||||
# Read in the lookups
|
||||
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
|
||||
)))
|
||||
uprn_lookup_1["match_type"] = "Exact"
|
||||
|
||||
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
|
||||
)))
|
||||
uprn_lookup_2 = uprn_lookup_2.rename(
|
||||
columns={
|
||||
"epc_address": "standardised_address",
|
||||
"epc_postcode": "standardised_postcode"
|
||||
}
|
||||
)
|
||||
uprn_lookup_2["match_type"] = "EPC"
|
||||
uprn_lookup_2["uprn"] = np.where(
|
||||
uprn_lookup_2["internal_id"] == 1091,
|
||||
83143766,
|
||||
uprn_lookup_2["uprn"]
|
||||
)
|
||||
|
||||
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
|
||||
)))
|
||||
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
|
||||
concatenate_row, axis=1
|
||||
)
|
||||
uprn_lookup_3 = uprn_lookup_3[
|
||||
["udprn", "uprn", "standardised_address", "postcode"]
|
||||
].rename(columns={"postcode": "standardised_postcode"})
|
||||
uprn_lookup_3["match_type"] = "Exact"
|
||||
|
||||
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
|
||||
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
|
||||
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
|
||||
# prepare lookup 4
|
||||
uprn_lookup_4 = []
|
||||
for _, x in uprn_lookup_4_basis.iterrows():
|
||||
|
||||
property_type = None
|
||||
built_form = None
|
||||
if x["option"] == 1:
|
||||
uprn = x["os_option_1_uprn"]
|
||||
standardised_address = x["os_option_1_address"]
|
||||
postcode = x["os_option_1_postcode"]
|
||||
elif x["option"] == 2:
|
||||
uprn = x["os_option_2_uprn"]
|
||||
standardised_address = x["os_option_2_address"]
|
||||
postcode = x["os_option_2_address"].split(", ")[-1]
|
||||
else:
|
||||
uprn = x["manual_uprn"]
|
||||
standardised_address = x["manual_address"]
|
||||
postcode = x["manual_postcode"]
|
||||
|
||||
uprn_lookup_4.append(
|
||||
{
|
||||
"internal_id": x["internal_id"],
|
||||
"external_address_id": x["external_address_id"],
|
||||
"uprn": uprn,
|
||||
"standardised_address": standardised_address,
|
||||
"standardised_postcode": postcode,
|
||||
"property_type": property_type,
|
||||
"built_form": built_form
|
||||
}
|
||||
)
|
||||
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
|
||||
uprn_lookup_4["match_type"] = "Fuzzy"
|
||||
|
||||
# concat
|
||||
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
|
||||
|
||||
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
|
||||
|
||||
# Final preps of lookups
|
||||
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
|
||||
uprn_lookup_3 = uprn_lookup_3.merge(
|
||||
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
|
||||
)
|
||||
uprn_lookup = pd.concat([
|
||||
uprn_lookup,
|
||||
uprn_lookup_3,
|
||||
uprn_lookup_4
|
||||
])
|
||||
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(
|
||||
uprn_lookup.drop(columns=["udprn"]),
|
||||
how="inner",
|
||||
on=["internal_id", "external_address_id"]
|
||||
)
|
||||
|
||||
return asset_list, uprn_lookup_2
|
||||
|
||||
|
||||
def read_omosis_wave_2_1():
|
||||
osmosis_wave_2_1 = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
|
||||
header=4,
|
||||
)
|
||||
# Remove double spaces from "Name"
|
||||
osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace(" ", " ")
|
||||
|
||||
osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
|
||||
osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]
|
||||
# We produce a cleaned list of asset ids from osmosis_wave_2_1
|
||||
osmosis_wave_2_1_asset_ids = [x for x in osmosis_wave_2_1["Asset ID"].values if not pd.isnull(x)]
|
||||
# We have some ids that are in the form 'id1, id2' so we split them
|
||||
osmosis_wave_2_1_asset_ids = [int(x.strip()) for id_str in osmosis_wave_2_1_asset_ids for x in id_str.split(",")]
|
||||
|
||||
return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
|
||||
|
||||
|
||||
def read_stonewater_asset_data():
|
||||
master_sheet = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
|
||||
"sheet.csv",
|
||||
encoding='latin1'
|
||||
)
|
||||
|
||||
master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)
|
||||
|
||||
previous_waves = master_sheet[
|
||||
(master_sheet["In Osmosis W2.1"] == "Yes") |
|
||||
(master_sheet["In Wates Wave 2.1"] == "Yes") |
|
||||
(master_sheet["In Liv Green Wave 2.1"] == "Yes") |
|
||||
(master_sheet["In CCS Wave 2.1"] == "Yes")
|
||||
].copy()
|
||||
|
||||
previous_waves_address_id = [str(x) for x in previous_waves["Address ID"].values if not pd.isnull(x)]
|
||||
|
||||
# We also read the priority postcodes
|
||||
priority_postcodes = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
|
||||
"postcodes.csv",
|
||||
header=17
|
||||
)
|
||||
|
||||
priority_postcodes = priority_postcodes["Postcode"].tolist()
|
||||
|
||||
return priority_postcodes, previous_waves_address_id, master_sheet
|
||||
|
||||
|
||||
def read_epc_data(uprn_lookup_2):
|
||||
epc_data = json.loads(
|
||||
read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data.json"
|
||||
)
|
||||
)
|
||||
epc_data = pd.DataFrame(epc_data)
|
||||
|
||||
epc_data["uprn"] = np.where(
|
||||
epc_data["internal_id"] == 1091,
|
||||
83143766,
|
||||
epc_data["uprn"]
|
||||
)
|
||||
|
||||
# We drop come EPCS
|
||||
epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]
|
||||
|
||||
epc_data_batch_2 = read_pickle_from_s3(
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
|
||||
|
||||
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
|
||||
|
||||
return complete_epcs
|
||||
|
|
|
|||
80
etl/sfr/epc_average_by_postcode.py
Normal file
80
etl/sfr/epc_average_by_postcode.py
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from backend.app.utils import sap_to_epc
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
||||
|
||||
def app():
|
||||
"""
|
||||
This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score
|
||||
:return:
|
||||
"""
|
||||
|
||||
source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
|
||||
source_file["row_id"] = source_file.index
|
||||
# Split out the town, which is the final portion of the string, separated by commas
|
||||
source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
|
||||
source_file["Address"] = source_file["Address"].apply(
|
||||
lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
|
||||
)
|
||||
|
||||
unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
|
||||
|
||||
# for each postcode, pull EPC data
|
||||
collected_data = []
|
||||
no_data_found = []
|
||||
no_data_after_filters = []
|
||||
for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
|
||||
address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
|
||||
searcher = SearchEpc(
|
||||
postcode=config["Postcode"],
|
||||
address1=address1,
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key=""
|
||||
)
|
||||
while True:
|
||||
params = {
|
||||
"postcode": config["Postcode"],
|
||||
"address": address1,
|
||||
}
|
||||
results = searcher.client.domestic.search(params=params, size=10000)
|
||||
if not results:
|
||||
# We strip back address1
|
||||
address1 = " ".join(address1.split(" ")[:-1])
|
||||
if not address1:
|
||||
break
|
||||
else:
|
||||
break
|
||||
|
||||
if not results:
|
||||
no_data_found.append(config)
|
||||
continue
|
||||
|
||||
data = pd.DataFrame(results["rows"])
|
||||
|
||||
data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
|
||||
# Take EPCs post 2023
|
||||
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
|
||||
data = data[data["lodgement-date"] >= "2023-01-01"]
|
||||
# Take private nrentals
|
||||
data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
|
||||
|
||||
if data.empty:
|
||||
no_data_after_filters.append(config)
|
||||
continue
|
||||
|
||||
agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
|
||||
agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
|
||||
agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
|
||||
agg.insert(0, "Postcode", config["Postcode"])
|
||||
agg.insert(0, "Address", address1)
|
||||
|
||||
collected_data.append(agg)
|
||||
|
||||
collected_df = pd.concat(collected_data)
|
||||
collected_df.to_csv("EPC Averages SFR.csv", index=False)
|
||||
Loading…
Add table
Reference in a new issue