Merge pull request #309 from Hestia-Homes/stonewater

Stonewater
This commit is contained in:
KhalimCK 2024-07-01 15:30:01 +01:00 committed by GitHub
commit 8b70a71241
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 719 additions and 214 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

5
.idea/misc.xml generated
View file

@ -3,7 +3,10 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 7.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

View file

@ -0,0 +1,8 @@
import os
import json
import dotenv
# When running locally, we'll need to load the .env file so that the variables
# it defines can be read from the environment below
dotenv.load_dotenv()
# Mapbox token taken from the environment; os.getenv returns None when the variable is not set
MAPBOX_ACCESS_TOKEN = os.getenv("MAPBOX_ACCESS_TOKEN")

View file

@ -0,0 +1,230 @@
import dash_bootstrap_components as dbc
from dash import html, dcc
import json
import plotly.graph_objects as go
import pandas as pd
from config import MAPBOX_ACCESS_TOKEN
def make_real_epc_piechart(real_epc_breakdown):
    """Build a pie chart from the real-EPC breakdown records.

    Args:
        real_epc_breakdown: Records that each provide an ``"is_real_epc"``
            label and a ``"count"`` value.

    Returns:
        A ``dcc.Graph`` wrapping the pie chart, with the mode bar hidden.
    """
    slice_labels = [record["is_real_epc"] for record in real_epc_breakdown]
    slice_sizes = [record["count"] for record in real_epc_breakdown]

    pie = go.Pie(
        labels=slice_labels,
        values=slice_sizes,
        marker_colors=["#027fa6", "rgb(225 225 225)"],
    )
    figure = go.Figure(data=[pie])
    # Remove the top margin of the figure
    figure.update_layout(margin={"t": 0})

    return dcc.Graph(figure=figure, config={"displayModeBar": False})
def make_epc_rating_piechart(epc_rating_breakdown):
    """Build a pie chart of EPC ratings, ordered from A to G.

    Each slice is coloured according to its rating rather than its position,
    so the colours stay correct when a rating band is absent from the data.

    Args:
        epc_rating_breakdown: List of records that each provide an ``"EPC"``
            rating and a ``"count"`` value.

    Returns:
        A ``dcc.Graph`` wrapping the pie chart, with the mode bar hidden.
    """
    rating_colors = {
        "A": "#117d58",
        "B": "#2da55c",
        "C": "#8dbd40",
        "D": "#f7cd14",
        "E": "#f3a96a",
        "F": "#ef8026",
        "G": "#e41e3b",
    }
    # Neutral colour for any label that is not one of the A-G bands
    fallback_color = "#cccccc"

    # Sort alphabetically by rating, i.e. from A to G
    epc_rating_breakdown = sorted(epc_rating_breakdown, key=lambda x: x["EPC"])
    labels = [x["EPC"] for x in epc_rating_breakdown]
    values = [x["count"] for x in epc_rating_breakdown]
    marker_colors = [rating_colors.get(label, fallback_color) for label in labels]

    fig = go.Figure(
        # sort=False keeps the slices in the A-G order established above
        data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
    )
    fig.update_layout(margin={"t": 0})
    plot = dcc.Graph(figure=fig, config={"displayModeBar": False})
    return plot
def make_map(locations):
    """Plot the given properties as markers on a Mapbox map.

    Args:
        locations: List of records that each provide ``uprn``,
            ``standardised_address``, ``standardised_postcode``,
            ``LATITUDE`` and ``LONGITUDE``.

    Returns:
        A ``dcc.Graph`` containing the map, or ``None`` when ``locations``
        is empty.
    """
    if not locations:
        return None

    df = pd.DataFrame(locations)

    def _hover_text(row):
        # A missing UPRN (None/NaN) cannot be converted to int; show a
        # placeholder instead of failing to render the whole map
        uprn = "N/A" if pd.isna(row["uprn"]) else int(row["uprn"])
        return (
            f"UPRN: {uprn}<br>Address: {row['standardised_address']}<br>Postcode: "
            f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}"
        )

    # Create custom hover text
    df["hover_text"] = df.apply(_hover_text, axis=1)

    data = [
        go.Scattermapbox(
            lat=df["LATITUDE"].tolist(),
            lon=df["LONGITUDE"].tolist(),
            mode="markers",
            marker=go.scattermapbox.Marker(size=10, color="#027fa6"),
            text=df["hover_text"],  # Use the custom hover text
            hoverinfo="text",
        )
    ]

    layout = go.Layout(
        autosize=True,
        hovermode="closest",
        mapbox=go.layout.Mapbox(
            accesstoken=MAPBOX_ACCESS_TOKEN,
            bearing=0,
            # Fixed initial view (centre and zoom level) of the map
            center=go.layout.mapbox.Center(lat=53, lon=-1.5),
            pitch=0,
            zoom=5,
        ),
        margin={"t": 0},
    )

    fig = go.Figure(data=data, layout=layout)
    plot = dcc.Graph(figure=fig, config={"displayModeBar": False})
    return plot
def _load_json(path):
    """Read and parse the JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def _page_header():
    """Build the top of the page: logo banner, "Powered by" strip, title and intro text."""
    # Banner with logos
    banner = dbc.Row(
        [
            dbc.Col(html.Img(src="assets/stonewater-logo.png", height="50px"), width="auto"),
            dbc.Col(html.Img(src="assets/osmosis-Logo.svg", height="50px"), width="auto"),
            dbc.Col(
                # Styled Div with no children
                html.Div(style={"color": "white", "fontSize": "1.5rem", "fontWeight": "bold"}),
                width=True,
                className="text-center",
            ),
        ],
        className="align-items-center",
        style={"backgroundColor": "#027fa6", "padding": "10px"},
    )

    powered_by = dbc.Row(
        [
            dbc.Col(
                "Powered by",
                style={"color": "#027fa6", "fontSize": "1rem", "zIndex": 10},
                width="auto",
            ),
            dbc.Col(
                html.A(
                    html.Img(src="assets/hestia-logo.png", height="50px"),
                    href="https://hestia.homes",
                ),
                width="auto",
                style={"marginLeft": "-60px"},
            ),
        ],
        # "start" is the accepted spelling for left-justified content; "left" is not a valid value
        justify="start",
        align="center",
    )

    title = html.H1(
        "Stonewater Survey Map",
        style={"fontSize": "2.5rem", "fontWeight": "bold", "marginBottom": "20px"},
    )
    intro = html.P(
        "This map shows the location of the properties that are to be surveyed by Osmosis.",
        style={"fontSize": "1.25rem", "marginBottom": "40px"},
    )

    return dbc.Row(
        dbc.Col(
            html.Div([banner, powered_by, title, intro], className="text-center"),
            width=12,
        ),
        className="mt-5",
    )


def _chart_column(title, description, chart):
    """Build one half-width column: a bold heading, a description and a chart beneath them."""
    return dbc.Col(
        [
            html.Div(
                title,
                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
                className="text-center",
            ),
            html.Div(description, style={"marginBottom": "1em"}),
            chart,
        ],
        width={"size": 5},
    )


def layout():
    """Build the page layout: header, property map and the two EPC pie charts.

    The data is read from three JSON files, which are opened by relative
    path and are therefore resolved against the current working directory.

    Returns:
        A fluid ``dbc.Container`` holding the whole page.
    """
    # Get the data
    locations = _load_json("Stonewater Mapping Data.json")
    # Get the EPC breakdown data
    real_epc_breakdown = _load_json("Stonewater real EPC breakdown.json")
    # Get the EPC ratings data
    epc_rating_breakdown = _load_json("Stonewater EPC rating breakdown.json")

    map_row = dbc.Row(
        dbc.Col(
            make_map(locations=locations),
            width=10,
            align="center",
            className="text-center",
        ),
        justify="center",
    )

    real_epc_column = _chart_column(
        "Breakdown of real EPCs",
        "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
        "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
        make_real_epc_piechart(real_epc_breakdown),
    )

    epc_rating_column = _chart_column(
        "EPC Ratings for properties with an EPC",
        [
            "This pie chart shows the breakdown of EPC ratings, for properties that currently "
            "have an EPC. "
            "The ratings range from A to G, where surprisingly, there are two EPC properties "
            "that were initially "
            "expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
            " seen ",
            html.A(
                "here",
                href="https://find-energy-certificate.service.gov.uk/energy-certificate"
                     "/2708-5001-7327-6090-7284",
                target="_blank",
            ),
            " and ",
            html.A(
                "here",
                href="https://find-energy-certificate.service.gov.uk/energy-certificate"
                     "/1037-4032-1009-0361-7292",
                target="_blank",
            ),
            ".",
        ],
        make_epc_rating_piechart(epc_rating_breakdown),
    )

    charts_row = dbc.Row([real_epc_column, epc_rating_column], justify="center")

    page = dbc.Container(
        [_page_header(), map_row, charts_row],
        fluid=True,
        className="p-5",
    )
    return page

View file

@ -0,0 +1,12 @@
dash==2.8.1
gunicorn
pandas
dash-bootstrap-components==1.3.1
boto3
dropbox
Flask-Caching
dash-extensions
mysql-connector-python
sqlalchemy
werkzeug==2.3.7
python-dotenv

View file

@ -0,0 +1,45 @@
import logging
import secrets
import dash_bootstrap_components as dbc
from dash import html
from dash_extensions.enrich import DashProxy, MultiplexerTransform
import flask
from map_page import layout
# Module-level logger named after this module
logger = logging.getLogger(__name__)
# We just use a simple secret key for the moment
# NOTE(review): the key is generated afresh every time this module is imported, so it
# differs between processes and restarts - confirm nothing needs it to be stable.
SECRET_KEY = secrets.token_hex(24)
def init_app():
    """Create and configure the Dash application.

    Builds a ``DashProxy`` app on top of a new Flask server, sets the Flask
    ``SECRET_KEY``, the browser title and the page layout.

    Returns:
        The configured ``DashProxy`` instance.
    """
    app = DashProxy(
        __name__,
        server=flask.Flask(__name__),
        suppress_callback_exceptions=True,
        external_stylesheets=[
            dbc.themes.BOOTSTRAP,
            dbc.icons.FONT_AWESOME,
            "https://fonts.googleapis.com/css?family=Comfortaa",
        ],
        transforms=[MultiplexerTransform()],
    )
    server = app.server

    # Set app config
    server.config.update(
        SECRET_KEY=SECRET_KEY,
    )

    app.title = "Hestia X Stonewater"

    # Define the layout
    # The layout is built once, here, so the JSON data it reads is loaded at start-up
    app.layout = layout()

    return app
# Module-level application instance, built when this module is imported
app = init_app()

View file

@ -0,0 +1,8 @@
# Callbacks must be imported to run the app
import callbacks # NOQA
from server import app
# Expose the Flask server underlying the Dash app
# (presumably the object a WSGI server such as gunicorn is pointed at - confirm against the deployment config)
application = app.server
if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port 8080, with debug mode on.
    # NOTE(review): debug=True combined with host="0.0.0.0" exposes the debugger to the
    # network - confirm this entry point is only ever used for local development.
    app.run_server(port=8080, debug=True, host="0.0.0.0")

View file

@ -0,0 +1,132 @@
"""
This script prepares some outputs for the stonewater project, 27th June 2024
The work done so far has been data cleaning and clustering.
In this script, we do the following things:
1) Match the clustering data to the archetypes
2) Do some basic analysis on the data
3) Mapping of the archetypes
"""
import pandas as pd
import json
from utils.s3 import read_pickle_from_s3
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")

# Keep only the identifier and archetype columns
archetyped_asset_list = stonewater_asset_list[
    [
        "internal_id",
        "customer_asset_id",
        "external_address_id",
        "udprn",
        "uprn",
        "cluster",
        "archetype_representative",
        "rank",
    ]
].copy()

# Discard the rows that were not assigned an archetype, after which the rank can be made an integer
has_archetype = archetyped_asset_list["rank"] != "NO ARCHETYPE"
archetyped_asset_list = archetyped_asset_list[has_archetype]
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)

# Order by archetype, then by rank within the archetype
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])

# Read in the clustering features, which are merged on below
clustering_features = read_pickle_from_s3(
    bucket_name="retrofit-data-dev",
    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
)

# Put property-type and built-form first, followed by every other column in its existing order
leading_columns = ['property-type', 'built-form']
trailing_columns = [name for name in clustering_features.columns if name not in leading_columns]
clustering_features = clustering_features[leading_columns + trailing_columns]

# Attach the features to each archetyped property and give the identifier columns their report names
archetyped_asset_list = archetyped_asset_list.merge(
    clustering_features,
    on="internal_id",
    how="inner"
).rename(
    columns={
        "internal_id": "Osm. ID",
        "customer_asset_id": "Org. ref.",
        "external_address_id": "Address ID",
        "cluster": "Archetype ID",
        "archetype_representative": "Archetype Representative",
        "rank": "Archetype Group Rank",
    }
)

# Nullable integer type, so that missing UPRNs do not turn the column into floats
archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
# Create an extract of the features
# Look at number of combinations
# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
# - If we look at the number of combinations of property type, built form, and walls description, this jumps
# massively to 237 unique combinations
# - Adding roof description to the mix, we have 857 unique combinations
# - Adding floor description, we have 1278 unique combinations
# This doesn't even begin to consider the other variables that we have in the dataset, such as the property dimensions,
# location, and other factors.
# Ideally, we would perfectly separate these variables but this is not possible, given the constraint of needing ~450
# archetypes. We will need to make some compromises here. This is where a clustering algorithm can help us.
# We don't end up with perfect separation but we can get a good enough separation to make the archetypes useful, and can
# base the archetypes on a number of energy performance metrics, as well as location and other factors.
# archetyped_asset_list[
# ["property-type", "built-form", "walls-description", "roof-description",
# "floor-description"]].drop_duplicates().shape
# Save this as an excel
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
# We store the location data, which will be used for the mapping. We just need the longitude and latitude
# (alongside the UPRN and address columns) of each archetype representative
mapping_data = stonewater_asset_list[
    stonewater_asset_list["archetype_representative"]
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]

# internal_id is the only column the two frames share, so it is named explicitly as the merge key
mapping_data = mapping_data.merge(
    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
    on="internal_id",
)
mapping_data = mapping_data.drop(columns=["internal_id"])

with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as f:
    json.dump(mapping_data.to_dict(orient="records"), f)

# We also include some data for visualising the breakdown of EPCS
# The column names are set explicitly: a bare value_counts().to_frame().reset_index() names its
# columns differently before and after pandas 2.0, and the output must carry a "count" key
proportion_of_real_epcs = (
    clustering_features["estimated"]
    .value_counts()
    .rename_axis("estimated")
    .reset_index(name="count")
)
# Invert the true and false, so the flag reads "is a real EPC" rather than "is estimated"
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
proportion_of_real_epcs = proportion_of_real_epcs.rename(
    columns={"estimated": "is_real_epc"}
)

with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
    json.dump(proportion_of_real_epcs.to_dict(orient="records"), f)

# Produce the breakdown of EPC ratings, using only the properties whose EPC is not estimated
epc_rating_breakdown = (
    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
    .value_counts()
    .rename_axis("EPC")
    .reset_index(name="count")
)

with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
    json.dump(epc_rating_breakdown.to_dict(orient="records"), f)
# Select the properties whose EPC is rated A and is not estimated, then join each one
# to its asset list row.
# NOTE(review): the result is not used anywhere else in this script - presumably it is
# kept for ad-hoc inspection; confirm before removing.
epc_a_properties = clustering_features[
    (clustering_features["current-energy-rating"] == "A")
    & (~clustering_features["estimated"])
]
epc_a_properties = epc_a_properties.merge(
    stonewater_asset_list,
    on="internal_id",
    how="inner"
)

View file

@ -14,6 +14,11 @@ import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
save_dataframe_to_s3_parquet, save_pickle_to_s3
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -673,7 +678,8 @@ def compile_data():
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
header=4
)
udprn_data = pd.read_excel(
@ -1090,6 +1096,26 @@ def concatenate_row(row):
return ', '.join(row.dropna().replace('', None).dropna().astype(str))
def adjust_clusters(cluster_allocation, total_clusters):
    """Nudge the per-group cluster counts so that they sum to ``total_clusters``.

    Clusters are added to, or removed from, the groups one at a time, starting
    from the group with the most clusters. The groups are passed over as many
    times as needed, so the adjustment may exceed the number of groups. A
    group is never reduced below one cluster; if every group is already at
    one cluster the allocation is returned as it is, even though its total
    then still exceeds ``total_clusters``.

    Args:
        cluster_allocation: Mapping of group -> number of clusters. It is
            modified in place.
        total_clusters: The total that the allocation should sum to.

    Returns:
        The same ``cluster_allocation`` mapping, after adjustment.
    """
    adjustment = total_clusters - sum(cluster_allocation.values())
    if adjustment == 0 or not cluster_allocation:
        return cluster_allocation

    step = 1 if adjustment > 0 else -1
    while adjustment != 0:
        changed = False
        # Start from the largest group; the order is recomputed on every pass
        for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
            if step < 0 and cluster_allocation[group] <= 1:
                # Every group must keep at least one cluster
                continue
            cluster_allocation[group] += step
            adjustment -= step
            changed = True
            if adjustment == 0:
                break
        if not changed:
            # Nothing left that can be reduced, so stop rather than loop forever
            break

    return cluster_allocation
def compile_data_final():
# Updated version:
@ -1103,7 +1129,8 @@ def compile_data_final():
########################################################################
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
header=4
)
udprn_data = pd.read_excel(
@ -1633,124 +1660,243 @@ def compile_data_final():
# )
# from utils.s3 import read_pickle_from_s3
# data = read_pickle_from_s3(
# property_attributes = read_pickle_from_s3(
# bucket_name="retrofit-data-dev",
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
# )
# We perform some additional cleaning on the data
import msgpack
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
cleaned = msgpack.unpackb(cleaned, raw=False)
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
cleaners = {
"floor-description": FloorAttributes,
'hotwater-description': HotWaterAttributes,
'main-fuel': MainFuelAttributes,
'mainheat-description': MainHeatAttributes,
'mainheatcont-description': MainheatControlAttributes,
'roof-description': RoofAttributes,
'walls-description': WallAttributes,
'windows-description': WindowAttributes,
'lighting-description': LightingAttributes
}
for variable_to_clean in cleaned.keys():
unique_descriptions = property_attributes[variable_to_clean].unique()
clean_df = pd.DataFrame(cleaned[variable_to_clean])
# Check if we have any
missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
if missed:
descriptions_to_append = []
for description in missed:
if variable_to_clean == "lighting-description":
cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
else:
cln = cleaners[variable_to_clean](description)
to_append = {
"original_description": description,
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
**cln.process()
}
descriptions_to_append.append(to_append)
descriptions_to_append = pd.DataFrame(descriptions_to_append)
clean_df = pd.concat([clean_df, descriptions_to_append])
clean_df = clean_df.rename(
columns={
"thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
"is_assumed": f"{variable_to_clean}_is_assumed",
}
)
if 'thermal_transmittance_unit' in clean_df.columns:
clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])
starting_size = len(property_attributes)
property_attributes = property_attributes.merge(
clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
)
if starting_size != property_attributes.shape[0]:
raise Exception("something went wrong")
property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
# Fill missings
for k in clean_df.columns:
if k in property_attributes.columns:
property_attributes[k] = property_attributes[k].fillna("missing")
# We group some variables such as thermal transmittance for walls, roof, floors
# ranges = {
# "< 0.1": (0, 0.1),
# "0.1 - 0.3": (0.1, 0.3),
# "0.3 - 0.5": (0.3, 0.5),
# "0.5 - 0.7": (0.5, 0.7),
# "0.9 - 1": (0.9, 1),
# "1 - 1.5": (1, 1.5),
# "1.5 - 2": (1.5, 2),
# "2+": (2, 2.5)
# }
ranges = {
"< 0.1": (0, 0.1),
"0.1 - 0.3": (0.1, 0.3),
"0.3 - 0.5": (0.3, 0.5),
"0.5+": (0.5, 2.5),
}
# Generate the lookup table
thermal_transmittance_lookup_table = []
for i in range(1, 251):
value = i / 100
for label, (low, high) in ranges.items():
if low < value <= high:
thermal_transmittance_lookup_table.append({"from": value, "to": label})
break
# Convert to DataFrame for display
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
thermal_transmittance_cols = [
c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
]
for i, col in enumerate(thermal_transmittance_cols):
# Perform the mapping
to_col = f"to_{col}"
property_attributes[col] = property_attributes[col].astype(str)
property_attributes = property_attributes.merge(
thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
how="left",
left_on=col,
right_on="from",
suffixes=("", f"_{i}")
)
property_attributes = property_attributes.drop(columns=["from", col])
property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
# Drop the description columns that are the keys in cleaned
print("PUT ME BACK!!??")
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
# Perform the mapping
# CLUSTERING!!
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import OneHotEncoder
# from scipy.spatial.distance import cdist
#
# property_attributes.set_index('internal_id', inplace=True)
#
# # Step 1: Prepare the data
# # Identify categorical columns (you might need to adjust this)
# categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
# for col in categorical_cols:
# property_attributes[col] = property_attributes[col].astype(str)
#
# # Applying OneHotEncoder
# encoder = OneHotEncoder(sparse=False)
# encoded_cats = encoder.fit_transform(property_attributes[categorical_cols])
#
# # Creating a new DataFrame with encoded categorical data and original numerical data
# numerical_data = property_attributes.select_dtypes(include=[np.number])
# data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1)
#
# # Convert all column names to strings to satisfy KMeans requirements
# data_for_clustering.columns = data_for_clustering.columns.astype(str)
#
# # Step 2: K-Means Clustering
# k = 450 # number of clusters
# kmeans = KMeans(n_clusters=k, random_state=0)
# property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering)
#
# # Extracting centroids
# centroids = kmeans.cluster_centers_
#
# # Step 3: Assign clusters and rank rows
# # Calculating distances from each point to its cluster's centroid
# distances = cdist(data_for_clustering, centroids, 'euclidean')
# min_distances = distances.min(axis=1)
# property_attributes['distance_to_centroid'] = min_distances
#
# # Ranking rows by distance within each cluster
# property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first')
#
# # Sorting to verify
# property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
#
# # Optional: Displaying the dataframe
# print(property_attributes.head())
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
id_column = 'internal_id'
property_attributes.set_index(id_column, inplace=True)
grouping_columns = [
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
]
# Define the preprocessing for numerical and categorical features
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
for col in categorical_features:
property_attributes[col] = property_attributes[col].astype(str)
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(), categorical_features)
id_column = 'internal_id'
n_clusters = 450
random_state = 0
training_data_grouped = property_attributes.groupby(grouping_columns)
group_sizes = {name: len(group) for name, group in training_data_grouped}
total_size = sum(group_sizes.values())
cluster_allocation = {
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
}
# Adjust cluster allocation to ensure total clusters sum to 450
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
# TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
# collect the results of the clustering and then perform the transformations afterwards
final_clusters = []
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
group_n_clusters = cluster_allocation[group_variables]
group_data.set_index(id_column, inplace=True)
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numerical_features),
('cat', OneHotEncoder(), categorical_features)
]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
# Fit the pipeline to the data
pipeline.fit(group_data)
# Transform the data using the fitted pipeline
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
# Get cluster labels
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
# Get centroids (already in the same transformed space)
centroids = pipeline.named_steps['kmeans'].cluster_centers_
# if the data isn't an array, make it one
if not isinstance(processed_data, np.ndarray):
processed_data = processed_data.toarray()
# Calculate distances from each point to the centroid of its cluster
distances_to_centroids = [
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
for i, label in enumerate(group_data['cluster'])
]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('kmeans', KMeans(n_clusters=450, random_state=0))])
group_data['distance_to_centroid'] = distances_to_centroids
# Fit the pipeline to the data
pipeline.fit(property_attributes)
# for cluster_id in group_data['cluster'].unique():
# cluster_data = group_data[group_data['cluster'] == cluster_id]
# min_distance = cluster_data['distance_to_centroid'].min()
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
# if min_distance != 0:
# print(f"No point with zero distance found in cluster {cluster_id}")
# Transform the data using the fitted pipeline
processed_data = pipeline.named_steps['preprocessor'].transform(property_attributes)
# Ranking rows by distance within each cluster
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
# Get cluster labels
property_attributes['cluster'] = pipeline.named_steps['kmeans'].labels_
# Sorting to verify
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
group_data.reset_index(inplace=True)
# Get centroids (already in the same transformed space)
centroids = pipeline.named_steps['kmeans'].cluster_centers_
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
final_clusters.append(to_append)
processed_data = processed_data.toarray()
final_clusters = pd.concat(final_clusters)
# remap the clusters from the current names to 1 -> n_clusters
# Calculate distances from each point to the centroid of its cluster
distances_to_centroids = [
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
for i, label in enumerate(property_attributes['cluster'])
]
property_attributes['distance_to_centroid'] = distances_to_centroids
for cluster_id in property_attributes['cluster'].unique():
cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
min_distance = cluster_data['distance_to_centroid'].min()
print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
if min_distance != 0:
print(f"No point with zero distance found in cluster {cluster_id}")
# Ranking rows by distance within each cluster
property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(
method='first')
# Sorting to verify
property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
################################################
# Prepare outputs!!!!
################################################
property_attributes.reset_index(inplace=True)
property_attributes = property_attributes.merge(
final_clusters, how="left", on="internal_id"
)
property_attributes["archetype_representative"] = property_attributes["rank"] == 1
asset_list_with_archetypes = asset_list.merge(
@ -1769,7 +1915,7 @@ def compile_data_final():
asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
"archetype_representative"].fillna(False)
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)
stonewater_uprn_lookup = asset_list_with_archetypes[
["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
@ -1777,110 +1923,6 @@ def compile_data_final():
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
################################################
# Agglomerative Clustering
################################################
# from sklearn.cluster import KMeans, AgglomerativeClustering
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from scipy.spatial.distance import cdist
# import numpy as np
# from collections import Counter
#
# id_column = 'internal_id'
# property_attributes.set_index(id_column, inplace=True)
#
# # Define the preprocessing for numerical and categorical features
# numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
# categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
#
# for col in categorical_features:
# property_attributes[col] = property_attributes[col].astype(str)
#
# preprocessor = ColumnTransformer(
# transformers=[
# ('num', StandardScaler(), numerical_features),
# ('cat', OneHotEncoder(sparse_output=False), categorical_features)
# ]
# )
#
# # Function to perform clustering and merge small clusters
# def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5):
# while True:
# # Preprocess the data
# processed_data = preprocessor.fit_transform(data)
#
# # Initial clustering
# clustering = AgglomerativeClustering(n_clusters=n_clusters)
# labels = clustering.fit_predict(processed_data)
#
# # Check cluster sizes
# cluster_counts = Counter(labels)
#
# # Find clusters smaller than min_size
# small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size}
#
# if not small_clusters:
# break
#
# # Merge small clusters
# for cluster in small_clusters:
# # Find the nearest cluster to merge with
# cluster_data = processed_data[labels == cluster]
# other_clusters = [i for i in range(n_clusters) if i not in small_clusters]
# other_cluster_data = [processed_data[labels == i] for i in other_clusters]
# other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data])
#
# distances = cdist(cluster_data, other_centroids).mean(axis=0)
# closest_cluster = other_clusters[np.argmin(distances)]
#
# labels[labels == cluster] = closest_cluster
#
# n_clusters -= len(small_clusters)
#
# return labels
#
# # Perform clustering with minimum size constraint
# n_clusters = 10
# min_size = 5
# property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size)
#
# # Filter out empty clusters
# valid_clusters = property_attributes['cluster'].unique()
#
# # Get centroids for the resulting clusters
# processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"]))
# centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters])
#
# # Calculate distances from each point to the centroid of its cluster
# distances_to_centroids = [
# cdist(processed_data[i].reshape(1, -1),
# centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0]
# for i, label in enumerate(property_attributes['cluster'])
# ]
#
# property_attributes['distance_to_centroid'] = distances_to_centroids
#
# # Verify that at least one point in each cluster has zero distance to the centroid
# for cluster_id in valid_clusters:
# cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
# min_distance = cluster_data['distance_to_centroid'].min()
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
# if min_distance != 0:
# print(f"No point with zero distance found in cluster {cluster_id}")
#
# # Rank the distances within each cluster
# property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \
# .rank(method='first')
#
# # Reset index to get 'internal_id' back
# property_attributes.reset_index(inplace=True)
#
# # Display the DataFrame
# print(property_attributes)
def pull_ideal_postcodes(missing_uprn_with_udprn):
api_key = ""  # Log in to the platform to get the API key: https://account.ideal-postcodes.co.uk/

View file

@ -38,7 +38,7 @@ class FloorAttributes(Definitions):
self.description: str = description.lower()
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
description in self.OBSERVED_ERRORS)
description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor")
# Try to perform a translation, in case it's in Welsh
self.translate_welsh_text()

View file

@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions):
def __init__(self, description: str):
self.description: str = clean_description(description.lower()).strip()
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
self.description == "sap05 hot-water"
)
translation = self.WELSH_TEXT.get(self.description)

View file

@ -1,15 +1,18 @@
import re
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
from etl.epc_clean.utils import correct_spelling
class LightingAttributes:
class LightingAttributes(Definitions):
WELSH_TEXT = {
"goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets",
"dim goleuadau ynni-isel": "no low energy lighting",
"goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets'
}
OBSERVED_ERRORS = []
def __init__(self, description, averages):
self.description: str = clean_description(description.lower())
@ -18,6 +21,9 @@ class LightingAttributes:
self.description = correct_spelling(self.description)
self.averages = averages
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting")
def welsh_translation_search(self):
"""
For welsh text describing the percentage of low energy lighting, we match the regular
@ -40,6 +46,9 @@ class LightingAttributes:
description = self.description
if self.nodata:
return {"low_energy_proportion": None}
if 'no low energy lighting' in description:
return {"low_energy_proportion": 0}

View file

@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions):
self.description: str = clean_description(self.description).strip()
# Remove special characters
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or (
description == "SAP05:Main-Heating"
)
translation = self.WELSH_TEXT.get(self.description)
if translation:
@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions):
self.process_edge_cases()
if (not description or not any(
rt in self.description for rt in
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
) and not self.is_edge_case):
raise ValueError('Invalid description')
if not self.nodata:
if (not description or not any(
rt in self.description for rt in
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
) and not self.is_edge_case):
raise ValueError('Invalid description')
def process_edge_cases(self) -> (dict, bool):
"""

View file

@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions):
def __init__(self, description: str):
self.description: str = clean_description(description.lower()).strip()
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
description == "SAP05:Main-Heating-Controls"
)
translation = self.WELSH_TEXT.get(self.description)
if translation:

View file

@ -75,12 +75,19 @@ class WallAttributes(Definitions):
'insulation_thickness', 'external_insulation', 'internal_insulation'
]
CORRECTIONS = {
"Granite or whin, as built, no insulation (assumed)": "Granite or whinstone, as built, no insulation (assumed)",
}
def __init__(self, description: str):
"""
:param description: Description of the walls.
"""
self.description: str = description
if self.description in self.CORRECTIONS:
self.description = self.CORRECTIONS[self.description]
self.welsh_translation_search()
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES

View file

@ -38,7 +38,7 @@ class WindowAttributes(Definitions):
# In the case of an empty description, we want to return a dictionary with all values set to False
# and indicate there was no data
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows"
translation = self.WELSH_TEXT.get(self.description)
if translation:

View file

@ -2,8 +2,8 @@ import re
import string
from typing import Tuple, Union, Dict, List
THERMAL_TRANSMITTANCE_STR = r"average thermal transmittance (-?\d+(\.\d+)?)\s(w/m\S+k)"
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTANCE_STR)
THERMAL_TRANSMITTANCE_STR = r"average thermal transmittance\s*[=:-]?\s*(-?\d+(\.\d+)?)\s*[wW]/m\S*[kK]"
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTANCE_STR, re.IGNORECASE)
DOUBLE_SPACE_PATTERN = re.compile(r"\s+")