Merge pull request #309 from Hestia-Homes/stonewater

Stonewater
2026-06-08 11:17:27 +00:00 · 2024-07-01 15:30:01 +01:00 · 2024-07-01 15:30:01 +01:00 · 8b70a71241
commit 8b70a71241
parent 0875213779 51333ff31a
22 changed files with 719 additions and 214 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,10 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
+  <component name="PyCharmProfessionalAdvertiser">
+    <option name="shown" value="true" />
+  </component>
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
--- a/etl/customers/stonewater/map_app/assets/hestia-logo.png
+++ b/etl/customers/stonewater/map_app/assets/hestia-logo.png
--- a/etl/customers/stonewater/map_app/assets/osmosis-Logo.svg
+++ b/etl/customers/stonewater/map_app/assets/osmosis-Logo.svg
--- a/etl/customers/stonewater/map_app/assets/stonewater-logo.png
+++ b/etl/customers/stonewater/map_app/assets/stonewater-logo.png
--- a/etl/customers/stonewater/map_app/callbacks.py
+++ b/etl/customers/stonewater/map_app/callbacks.py
--- a/etl/customers/stonewater/map_app/config.py
+++ b/etl/customers/stonewater/map_app/config.py
@ -0,0 +1,8 @@
+import os
+import json
+import dotenv
+
+# When running locally, we'll need to load the .env file so that the variables it defines
+# can be read through os.getenv below
+dotenv.load_dotenv()
+
+# Mapbox access token, read from the environment. os.getenv returns None when the variable is not set.
+MAPBOX_ACCESS_TOKEN = os.getenv("MAPBOX_ACCESS_TOKEN")
--- a/etl/customers/stonewater/map_app/map_page.py
+++ b/etl/customers/stonewater/map_app/map_page.py
@ -0,0 +1,230 @@
+import dash_bootstrap_components as dbc
+from dash import html, dcc
+import json
+import plotly.graph_objects as go
+import pandas as pd
+
+from config import MAPBOX_ACCESS_TOKEN
+
+
+def make_real_epc_piechart(real_epc_breakdown):
+    """
+    Build a pie chart from the real-EPC breakdown records.
+
+    :param real_epc_breakdown: list of records, each with an "is_real_epc" entry (used as the slice label)
+        and a "count" entry (used as the slice size)
+    :return: a dcc.Graph wrapping the pie figure, with the mode bar hidden
+    """
+    slice_labels, slice_sizes = [], []
+    for record in real_epc_breakdown:
+        slice_labels.append(record["is_real_epc"])
+        slice_sizes.append(record["count"])
+
+    pie = go.Pie(
+        labels=slice_labels,
+        values=slice_sizes,
+        marker_colors=["#027fa6", "rgb(225 225 225)"],
+    )
+
+    figure = go.Figure(data=[pie])
+    # Drop the top margin
+    figure.update_layout(margin={"t": 0})
+
+    return dcc.Graph(figure=figure, config={"displayModeBar": False})
+
+
+def make_epc_rating_piechart(epc_rating_breakdown):
+    """
+    Build a pie chart of EPC rating counts, with slices ordered from A to G.
+
+    :param epc_rating_breakdown: list of records, each with an "EPC" entry (the rating, used as the slice
+        label) and a "count" entry (used as the slice size)
+    :return: a dcc.Graph wrapping the pie figure, with the mode bar hidden
+    """
+    # Each rating has a fixed colour, from green (A) to red (G). Colours are looked up by rating rather than
+    # assigned by position, so that a rating which is absent from the data doesn't shift the colours of the
+    # ratings after it.
+    rating_colors = {
+        "A": "#117d58",
+        "B": "#2da55c",
+        "C": "#8dbd40",
+        "D": "#f7cd14",
+        "E": "#f3a96a",
+        "F": "#ef8026",
+        "G": "#e41e3b",
+    }
+    # Neutral grey for any label that isn't one of the seven ratings
+    fallback_color = "#e1e1e1"
+
+    # Order the slices alphabetically by rating, i.e. from A to G
+    epc_rating_breakdown = sorted(epc_rating_breakdown, key=lambda x: x["EPC"])
+
+    labels = [x["EPC"] for x in epc_rating_breakdown]
+    values = [x["count"] for x in epc_rating_breakdown]
+    marker_colors = [rating_colors.get(label, fallback_color) for label in labels]
+
+    # sort=False keeps the A to G order instead of letting plotly re-order the slices by size
+    fig = go.Figure(
+        data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
+    )
+
+    fig.update_layout(margin={"t": 0})
+
+    plot = dcc.Graph(figure=fig, config={"displayModeBar": False})
+
+    return plot
+
+
+def make_map(locations):
+    """
+    Build a Mapbox scatter map with one marker per location.
+
+    :param locations: list of records, each with "uprn", "standardised_address", "standardised_postcode",
+        "LATITUDE" and "LONGITUDE" entries
+    :return: a dcc.Graph wrapping the map, or None when there are no locations
+    """
+    if not locations:
+        return None
+
+    df = pd.DataFrame(locations)
+
+    def format_uprn(uprn):
+        # A record without a UPRN comes through as NaN/None, which int() cannot convert. Show a placeholder
+        # rather than failing to render the whole map.
+        return "Unknown" if pd.isna(uprn) else int(uprn)
+
+    # Create custom hover text
+    df['hover_text'] = df.apply(
+        lambda row: f"UPRN: {format_uprn(row['uprn'])}<br>Address: {row['standardised_address']}<br>Postcode: "
+                    f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}",
+        axis=1)
+
+    data = [
+        go.Scattermapbox(
+            lat=df["LATITUDE"].tolist(),
+            lon=df["LONGITUDE"].tolist(),
+            mode="markers",
+            marker=go.scattermapbox.Marker(size=10, color="#027fa6"),
+            text=df["hover_text"],  # Use the custom hover text
+            hoverinfo='text'
+        )
+    ]
+
+    # Initial view: centred on lat 53, lon -1.5 at zoom level 5
+    layout = go.Layout(
+        autosize=True,
+        hovermode="closest",
+        mapbox=go.layout.Mapbox(
+            accesstoken=MAPBOX_ACCESS_TOKEN,
+            bearing=0,
+            center=go.layout.mapbox.Center(lat=53, lon=-1.5),
+            pitch=0,
+            zoom=5,
+        ),
+        margin={"t": 0},
+    )
+
+    fig = go.Figure(data=data, layout=layout)
+
+    plot = dcc.Graph(figure=fig, config={"displayModeBar": False})
+
+    return plot
+
+
+def layout():
+    """
+    Build the single page of the app: logo banner, survey map and the two EPC pie charts.
+
+    The three JSON inputs are opened by relative path, so they are resolved against the working directory
+    of the running process.
+
+    NOTE(review): the "3736 of the 5245" figures in the explanatory text below are hard-coded rather than
+    derived from the loaded breakdown - confirm they are updated whenever the JSON files are regenerated.
+
+    :return: a fluid dbc.Container holding the whole page
+    """
+    # Get the data
+    with open("Stonewater Mapping Data.json", "r") as file:
+        locations = json.load(file)
+
+    # Get the EPC breakdown data
+    with open("Stonewater real EPC breakdown.json") as file:
+        real_epc_breakdown = json.load(file)
+
+    # Get the EPC ratings data
+    with open("Stonewater EPC rating breakdown.json") as file:
+        epc_rating_breakdown = json.load(file)
+
+    page = dbc.Container(
+        [
+            dbc.Row(
+                dbc.Col(
+                    html.Div(
+                        [
+                            # Banner with logos
+                            dbc.Row(
+                                [
+                                    dbc.Col(
+                                        html.Img(src="assets/stonewater-logo.png", height="50px"),
+                                        width="auto"
+                                    ),
+                                    dbc.Col(
+                                        html.Img(src="assets/osmosis-Logo.svg", height="50px"),
+                                        width="auto"
+                                    ),
+                                    dbc.Col(
+                                        # Styled Div with no children
+                                        html.Div(
+                                            style={"color": "white", "font-size": "1.5rem", "font-weight": "bold"}
+                                        ),
+                                        width=True,
+                                        className="text-center"
+                                    )
+                                ],
+                                className="align-items-center",
+                                style={"background-color": "#027fa6", "padding": "10px"}
+                            ),
+                            # "Powered by" strip with the Hestia logo linking to the Hestia site
+                            dbc.Row(
+                                [
+                                    dbc.Col("Powered by", style={"color": "#027fa6", "fontSize": "1rem", 'zIndex': 10},
+                                            width="auto"),
+                                    dbc.Col(
+                                        html.A(
+                                            html.Img(src="assets/hestia-logo.png", height="50px"),
+                                            href="https://hestia.homes",
+                                        ),
+                                        width="auto",
+                                        style={"margin-left": "-60px"}
+                                    ),
+                                ],
+                                justify='left',
+                                align="center"
+                            ),
+                            html.H1(
+                                "Stonewater Survey Map",
+                                style={"font-size": "2.5rem", "font-weight": "bold", "margin-bottom": "20px"}
+                            ),
+                            html.P(
+                                "This map shows the location of the properties that are to be surveyed by Osmosis.",
+                                style={"font-size": "1.25rem", "margin-bottom": "40px"}
+                            ),
+                        ],
+                        className="text-center"
+                    ),
+                    width=12
+                ),
+                className="mt-5"
+            ),
+            # The map itself; make_map returns None when there are no locations
+            dbc.Row(
+                dbc.Col(
+                    make_map(locations=locations),
+                    width=10,
+                    align="center",
+                    className="text-center"
+                ),
+                justify="center"
+            ),
+            # The two pie charts, side by side
+            dbc.Row(
+                [
+                    dbc.Col(
+                        [
+                            html.Div(
+                                "Breakdown of real EPCs",
+                                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
+                                className='text-center'
+                            ),
+                            html.Div(
+                                "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
+                                "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
+                                style={"marginBottom": "1em"}
+                            ),
+                            make_real_epc_piechart(real_epc_breakdown),
+                        ],
+                        width={"size": 5},
+                    ),
+                    dbc.Col(
+                        [
+                            html.Div(
+                                "EPC Ratings for properties with an EPC",
+                                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
+                                className='text-center'
+                            ),
+                            html.Div(
+                                [
+                                    "This pie chart shows the breakdown of EPC ratings, for properties that currently "
+                                    "have an EPC. "
+                                    "The ratings range from A to G, where surprisingly, there are two EPC properties "
+                                    "that were initially "
+                                    "expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
+                                    " seen ",
+                                    html.A("here",
+                                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
+                                                "/2708-5001-7327-6090-7284",
+                                           target="_blank"),
+                                    " and ",
+                                    html.A("here",
+                                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
+                                                "/1037-4032-1009-0361-7292",
+                                           target="_blank"),
+                                    "."
+                                ],
+                                style={"marginBottom": "1em"}
+                            ),
+                            make_epc_rating_piechart(epc_rating_breakdown),
+                        ],
+
+                        width={"size": 5},
+                    ),
+                ],
+                justify="center"
+            )
+        ],
+        fluid=True,
+        className="p-5"
+    )
+
+    return page
--- a/etl/customers/stonewater/map_app/requirements.txt
+++ b/etl/customers/stonewater/map_app/requirements.txt
@ -0,0 +1,12 @@
+dash==2.8.1
+gunicorn
+pandas
+dash-bootstrap-components==1.3.1
+boto3
+dropbox
+Flask-Caching
+dash-extensions
+mysql-connector-python
+sqlalchemy
+werkzeug==2.3.7
+python-dotenv
--- a/etl/customers/stonewater/map_app/server.py
+++ b/etl/customers/stonewater/map_app/server.py
@ -0,0 +1,45 @@
+import logging
+import secrets
+
+import dash_bootstrap_components as dbc
+from dash import html
+from dash_extensions.enrich import DashProxy, MultiplexerTransform
+import flask
+from map_page import layout
+
+logger = logging.getLogger(__name__)
+
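+# We just use a simple secret key for the moment. NOTE: it is generated afresh every time this module is
+# imported, so it differs between processes and changes on every restart.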
+
+SECRET_KEY = secrets.token_hex(24)
+
+
+def init_app():
+    """
+    Create and configure the Dash application.
+
+    Builds a DashProxy app on top of a fresh Flask server, applies the Bootstrap / Font Awesome / Comfortaa
+    stylesheets, sets the Flask secret key and the page title, and assigns the page layout.
+
+    :return: the configured DashProxy app
+    """
+    app = DashProxy(
+        __name__,
+        server=flask.Flask(__name__),
+        suppress_callback_exceptions=True,
+        external_stylesheets=[
+            dbc.themes.BOOTSTRAP,
+            dbc.icons.FONT_AWESOME,
+            "https://fonts.googleapis.com/css?family=Comfortaa",
+        ],
+        transforms=[MultiplexerTransform()]
+    )
+
+    server = app.server
+
+    # Set app config
+    server.config.update(
+        SECRET_KEY=SECRET_KEY,
+    )
+
+    # Title of the app (previously misspelt as "Hesta")
+    app.title = "Hestia X Stonewater"
+
+    # Define the layout. layout() is called here, once, so the JSON files it reads are loaded when the app
+    # is initialised rather than on each page load.
+    app.layout = layout()
+
+    return app
+
+
+app = init_app()
--- a/etl/customers/stonewater/map_app/wsgi.py
+++ b/etl/customers/stonewater/map_app/wsgi.py
@ -0,0 +1,8 @@
+# Callbacks must be imported to run the app
+import callbacks  # NOQA
+from server import app
+
+# The Flask server underlying the Dash app - presumably the object a WSGI server such as gunicorn is
+# pointed at (gunicorn is listed in requirements.txt); confirm against the deployment config
+application = app.server
+
+if __name__ == "__main__":
+    # Direct execution: serve on port 8080, listening on all interfaces, with debug mode on.
+    # NOTE(review): debug=True together with host="0.0.0.0" should only ever be used for local
+    # development - confirm this entry point is never used in a deployed environment.
+    app.run_server(port=8080, debug=True, host="0.0.0.0")
--- a/etl/customers/stonewater/outputs
+++ b/etl/customers/stonewater/outputs
@ -0,0 +1,132 @@
+"""
+This script prepares some outputs for the stonewater project, 27th June 2024
+
+The work done so far has been data cleaning and clustering.
+In this script, we do the following things:
+
+1) Match the clustering data to the archetypes
+2) Do some basic analysis on the data
+3) Mapping of the archetypes
+"""
+import pandas as pd
+import json
+from utils.s3 import read_pickle_from_s3
+
+# The asset list, with the cluster / archetype assignment of each property. The path is relative, so it is
+# resolved against the working directory the script is run from.
+stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
+archetyped_asset_list = stonewater_asset_list[
+    [
+        "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
+        "archetype_representative", "rank"
+    ]
+].copy()
+# Rows without an archetype carry the string "NO ARCHETYPE" in the rank column; they have to be dropped
+# before the column can be cast to int
+archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
+archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
+# Sort
+archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
+
+# Read in and merge on clustering features
+clustering_features = read_pickle_from_s3(
+    bucket_name="retrofit-data-dev",
+    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+)
+
+# Move property-type and built-form to the first two columns
+columns_to_move = ['property-type', 'built-form']
+
+# Get the remaining columns
+remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
+
+# Create the new column order
+new_column_order = columns_to_move + remaining_columns
+
+# Reorder the DataFrame
+clustering_features = clustering_features[new_column_order]
+
+# Inner join: only properties present in both the archetyped asset list and the clustering features are kept
+archetyped_asset_list = archetyped_asset_list.merge(
+    clustering_features,
+    on="internal_id",
+    how="inner"
+)
+
+# Rename the identifier / archetype columns to their presentation names
+archetyped_asset_list = archetyped_asset_list.rename(
+    columns={
+        "internal_id": "Osm. ID",
+        "customer_asset_id": "Org. ref.",
+        "external_address_id": "Address ID",
+        "cluster": "Archetype ID",
+        "archetype_representative": "Archetype Representative",
+        "rank": "Archetype Group Rank",
+    }
+)
+# Nullable integer dtype, so that missing UPRNs are allowed without the column falling back to float
+archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
+# Create an extract of the features
+
+
+# Look at number of combinations
+# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
+# - If we look at the number of combinations of property type, built form, and walls description, this jumps
+# massively to 237 unique combinations
+# - Adding roof description to the mix, we have 857 unique combinations
+# - Adding floor description, we have 1278 unique combinations
+# This doesn't even begin to consider the other variables that we have in the dataset, such as the property dimensions,
+# location, and other factors.
+# Ideally, we would perfectly separate these variables but this is not possible, given the constraint of needing ~450
+# archetypes. We will need to make some compromises here. This is where a clustering algorithm can help us.
+# We don't end up with perfect separation but we can get a good enough separation to make the archetypes useful, and can
+# base the archetypes on a number of energy performance metrics, as well as location and other factors.
+# archetyped_asset_list[
+#     ["property-type", "built-form", "walls-description", "roof-description",
+#      "floor-description"]].drop_duplicates().shape
+
+# Save this as an Excel file
+# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
+
+# We store the location data, which will be used for the mapping. We just need the longitude and latitude
+# NOTE(review): the archetype_representative column is used directly as a row mask, so it is assumed to be
+# boolean - confirm against the CSV
+mapping_data = stonewater_asset_list[
+    stonewater_asset_list["archetype_representative"]
+][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
+
+# No join key is given, so pandas joins on the columns the two frames have in common - here that is only
+# internal_id
+mapping_data = mapping_data.merge(
+    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
+)
+mapping_data = mapping_data.drop(columns=["internal_id"])
+
+with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as f:
+    f.write(json.dumps(mapping_data.to_dict(orient="records")))
+
+# We also include some data for visualising the breakdown of EPCs
+# NOTE(review): the map app reads a "count" entry from these records, which relies on value_counts naming
+# its result "count" (pandas 2.x behaviour). pandas is not pinned in the map app's requirements.txt - confirm
+# the version used to run this script.
+proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
+# Invert the true and false
+proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
+proportion_of_real_epcs = proportion_of_real_epcs.rename(
+    columns={"estimated": "is_real_epc"}
+)
+
+with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
+    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
+
+# Produce the breakdown of EPC ratings, using only the properties whose EPC is not estimated
+epc_rating_breakdown = (
+    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
+    .value_counts()
+    .to_frame()
+    .reset_index()
+)
+
+epc_rating_breakdown = epc_rating_breakdown.rename(
+    columns={"current-energy-rating": "EPC"}
+)
+
+with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
+    f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
+
+# Properties with a non-estimated EPC rating of A, joined back onto the asset list.
+# NOTE(review): this result is not written out or used anywhere in the visible part of the script -
+# presumably it is here for interactive inspection.
+epc_a_properties = clustering_features[
+    (clustering_features["current-energy-rating"] == "A")
+    & (~clustering_features["estimated"])
+    ]
+
+epc_a_properties = epc_a_properties.merge(
+    stonewater_asset_list,
+    on="internal_id",
+    how="inner"
+)
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -14,6 +14,11 @@ import pandas as pd
 import time
 from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
    save_dataframe_to_s3_parquet, save_pickle_to_s3
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from scipy.spatial.distance import cdist

 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -673,7 +678,8 @@ def compile_data():
    # )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})

    asset_list = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        header=4
    )

    udprn_data = pd.read_excel(
@ -1090,6 +1096,26 @@ def concatenate_row(row):
    return ', '.join(row.dropna().replace('', None).dropna().astype(str))


+def adjust_clusters(cluster_allocation, total_clusters):
+    """
+    Adjust a per-group cluster allocation so that its values sum to total_clusters.
+
+    Clusters are handed out (or taken back) one at a time, visiting the groups from the largest allocation
+    to the smallest, and repeating for as many passes as it takes to reach the target. A group is never
+    reduced below one cluster, because each group still has to be clustered. If the target cannot be reached
+    (e.g. every group is already down to one cluster, or there are no groups), the allocation is returned as
+    close to the target as possible.
+
+    :param cluster_allocation: dict mapping group -> number of clusters. Updated in place.
+    :param total_clusters: the total number of clusters the allocation should sum to
+    :return: the same dict, after adjustment
+    """
+    adjustment = total_clusters - sum(cluster_allocation.values())
+    # +1 when clusters need to be handed out, -1 when they need to be taken back
+    step = 1 if adjustment > 0 else -1
+
+    while adjustment != 0:
+        changed = False
+        # Start from the largest group; the order is re-evaluated on every pass
+        for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
+            if step < 0 and cluster_allocation[group] <= 1:
+                # Never leave a group with zero clusters
+                continue
+            cluster_allocation[group] += step
+            adjustment -= step
+            changed = True
+            if adjustment == 0:
+                break
+        if not changed:
+            # Nothing left that can be adjusted, so stop rather than loop forever
+            break
+    return cluster_allocation
+
+
 def compile_data_final():
    # Updated version:

@ -1103,7 +1129,8 @@ def compile_data_final():
    ########################################################################

    asset_list = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        header=4
    )

    udprn_data = pd.read_excel(
@ -1633,124 +1660,243 @@ def compile_data_final():
    # )

    # from utils.s3 import read_pickle_from_s3
-    # data = read_pickle_from_s3(
+    # property_attributes = read_pickle_from_s3(
    #     bucket_name="retrofit-data-dev",
    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
    # )

+    # We perform some additional cleaning on the data
+    import msgpack
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+    from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
+    from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
+    from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
+    from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
+    from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+    from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+    from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
+    from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
+    from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
+
+    cleaners = {
+        "floor-description": FloorAttributes,
+        'hotwater-description': HotWaterAttributes,
+        'main-fuel': MainFuelAttributes,
+        'mainheat-description': MainHeatAttributes,
+        'mainheatcont-description': MainheatControlAttributes,
+        'roof-description': RoofAttributes,
+        'walls-description': WallAttributes,
+        'windows-description': WindowAttributes,
+        'lighting-description': LightingAttributes
+    }
+
+    for variable_to_clean in cleaned.keys():
+
+        unique_descriptions = property_attributes[variable_to_clean].unique()
+        clean_df = pd.DataFrame(cleaned[variable_to_clean])
+        # Check if we have any
+        missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
+        if missed:
+            descriptions_to_append = []
+            for description in missed:
+                if variable_to_clean == "lighting-description":
+                    cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
+                else:
+                    cln = cleaners[variable_to_clean](description)
+                to_append = {
+                    "original_description": description,
+                    "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
+                    **cln.process()
+                }
+                descriptions_to_append.append(to_append)
+
+            descriptions_to_append = pd.DataFrame(descriptions_to_append)
+            clean_df = pd.concat([clean_df, descriptions_to_append])
+
+        clean_df = clean_df.rename(
+            columns={
+                "thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
+                "is_assumed": f"{variable_to_clean}_is_assumed",
+            }
+        )
+
+        if 'thermal_transmittance_unit' in clean_df.columns:
+            clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])
+
+        starting_size = len(property_attributes)
+        property_attributes = property_attributes.merge(
+            clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
+        )
+        if starting_size != property_attributes.shape[0]:
+            raise Exception("something went wrong")
+        property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
+        # Fill missings
+        for k in clean_df.columns:
+            if k in property_attributes.columns:
+                property_attributes[k] = property_attributes[k].fillna("missing")
+
+    # We group some variables such as thermal transmittance for walls, roof, floors
+    # ranges = {
+    #     "< 0.1": (0, 0.1),
+    #     "0.1 - 0.3": (0.1, 0.3),
+    #     "0.3 - 0.5": (0.3, 0.5),
+    #     "0.5 - 0.7": (0.5, 0.7),
+    #     "0.9 - 1": (0.9, 1),
+    #     "1 - 1.5": (1, 1.5),
+    #     "1.5 - 2": (1.5, 2),
+    #     "2+": (2, 2.5)
+    # }
+
+    ranges = {
+        "< 0.1": (0, 0.1),
+        "0.1 - 0.3": (0.1, 0.3),
+        "0.3 - 0.5": (0.3, 0.5),
+        "0.5+": (0.5, 2.5),
+    }
+
+    # Generate the lookup table
+    thermal_transmittance_lookup_table = []
+    for i in range(1, 251):
+        value = i / 100
+        for label, (low, high) in ranges.items():
+            if low < value <= high:
+                thermal_transmittance_lookup_table.append({"from": value, "to": label})
+                break
+
+    # Convert to DataFrame for display
+    thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
+    thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
+
+    thermal_transmittance_cols = [
+        c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
+    ]
+    for i, col in enumerate(thermal_transmittance_cols):
+        # Perform the mapping
+        to_col = f"to_{col}"
+        property_attributes[col] = property_attributes[col].astype(str)
+        property_attributes = property_attributes.merge(
+            thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
+            how="left",
+            left_on=col,
+            right_on="from",
+            suffixes=("", f"_{i}")
+        )
+        property_attributes = property_attributes.drop(columns=["from", col])
+        property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
+
+    # Drop the description columns that are the keys in cleaned
+    print("PUT ME BACK!!??")
+    property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
+    # Perform the mapping
+
    # CLUSTERING!!
-
-    # from sklearn.cluster import KMeans
-    # from sklearn.preprocessing import OneHotEncoder
-    # from scipy.spatial.distance import cdist
-    #
-    # property_attributes.set_index('internal_id', inplace=True)
-    #
-    # # Step 1: Prepare the data
-    # # Identify categorical columns (you might need to adjust this)
-    # categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
-    # for col in categorical_cols:
-    #     property_attributes[col] = property_attributes[col].astype(str)
-    #
-    # # Applying OneHotEncoder
-    # encoder = OneHotEncoder(sparse=False)
-    # encoded_cats = encoder.fit_transform(property_attributes[categorical_cols])
-    #
-    # # Creating a new DataFrame with encoded categorical data and original numerical data
-    # numerical_data = property_attributes.select_dtypes(include=[np.number])
-    # data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1)
-    #
-    # # Convert all column names to strings to satisfy KMeans requirements
-    # data_for_clustering.columns = data_for_clustering.columns.astype(str)
-    #
-    # # Step 2: K-Means Clustering
-    # k = 450  # number of clusters
-    # kmeans = KMeans(n_clusters=k, random_state=0)
-    # property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering)
-    #
-    # # Extracting centroids
-    # centroids = kmeans.cluster_centers_
-    #
-    # # Step 3: Assign clusters and rank rows
-    # # Calculating distances from each point to its cluster's centroid
-    # distances = cdist(data_for_clustering, centroids, 'euclidean')
-    # min_distances = distances.min(axis=1)
-    # property_attributes['distance_to_centroid'] = min_distances
-    #
-    # # Ranking rows by distance within each cluster
-    # property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first')
-    #
-    # # Sorting to verify
-    # property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
-    #
-    # # Optional: Displaying the dataframe
-    # print(property_attributes.head())
-
-    from sklearn.cluster import KMeans
-    from sklearn.preprocessing import StandardScaler, OneHotEncoder
-    from sklearn.compose import ColumnTransformer
-    from sklearn.pipeline import Pipeline
-    from scipy.spatial.distance import cdist
-    id_column = 'internal_id'
-    property_attributes.set_index(id_column, inplace=True)
+    grouping_columns = [
+        'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
+    ]

    # Define the preprocessing for numerical and categorical features
    numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
+    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]

    for col in categorical_features:
        property_attributes[col] = property_attributes[col].astype(str)

-    preprocessor = ColumnTransformer(
-        transformers=[
-            ('num', StandardScaler(), numerical_features),
-            ('cat', OneHotEncoder(), categorical_features)
+    id_column = 'internal_id'
+    n_clusters = 450
+    random_state = 0
+
+    training_data_grouped = property_attributes.groupby(grouping_columns)
+    group_sizes = {name: len(group) for name, group in training_data_grouped}
+    total_size = sum(group_sizes.values())
+    cluster_allocation = {
+        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
+    }
+
+    # Adjust cluster allocation to ensure total clusters sum to 450
+    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
+
+    # TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
+    #       collect the results of the clustering and then perform the transformations afterwards
+
+    final_clusters = []
+    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
+
+        group_n_clusters = cluster_allocation[group_variables]
+        group_data.set_index(id_column, inplace=True)
+
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', StandardScaler(), numerical_features),
+                ('cat', OneHotEncoder(), categorical_features)
+            ]
+        )
+
+        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
+                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
+
+        # Fit the pipeline to the data
+        pipeline.fit(group_data)
+
+        # Transform the data using the fitted pipeline
+        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
+
+        # Get cluster labels
+        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
+
+        # Get centroids (already in the same transformed space)
+        centroids = pipeline.named_steps['kmeans'].cluster_centers_
+
+        # if the data isn't an array, make it one
+        if not isinstance(processed_data, np.ndarray):
+            processed_data = processed_data.toarray()
+
+        # Calculate distances from each point to the centroid of its cluster
+        distances_to_centroids = [
+            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
+            for i, label in enumerate(group_data['cluster'])
        ]
-    )

-    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
-                               ('kmeans', KMeans(n_clusters=450, random_state=0))])
+        group_data['distance_to_centroid'] = distances_to_centroids

-    # Fit the pipeline to the data
-    pipeline.fit(property_attributes)
+        # for cluster_id in group_data['cluster'].unique():
+        #     cluster_data = group_data[group_data['cluster'] == cluster_id]
+        #     min_distance = cluster_data['distance_to_centroid'].min()
+        #     print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
+        #     if min_distance != 0:
+        #         print(f"No point with zero distance found in cluster {cluster_id}")

-    # Transform the data using the fitted pipeline
-    processed_data = pipeline.named_steps['preprocessor'].transform(property_attributes)
+        # Ranking rows by distance within each cluster
+        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')

-    # Get cluster labels
-    property_attributes['cluster'] = pipeline.named_steps['kmeans'].labels_
+        # Sorting to verify
+        group_data.sort_values(by=['cluster', 'rank'], inplace=True)
+        group_data.reset_index(inplace=True)

-    # Get centroids (already in the same transformed space)
-    centroids = pipeline.named_steps['kmeans'].cluster_centers_
+        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
+        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
+        final_clusters.append(to_append)

-    processed_data = processed_data.toarray()
+    final_clusters = pd.concat(final_clusters)
+    # remap the composite cluster names to consecutive integer ids (0-based, in order of first appearance)

-    # Calculate distances from each point to the centroid of its cluster
-    distances_to_centroids = [
-        cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
-        for i, label in enumerate(property_attributes['cluster'])
-    ]
-
-    property_attributes['distance_to_centroid'] = distances_to_centroids
-
-    for cluster_id in property_attributes['cluster'].unique():
-        cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
-        min_distance = cluster_data['distance_to_centroid'].min()
-        print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
-        if min_distance != 0:
-            print(f"No point with zero distance found in cluster {cluster_id}")
-
-    # Ranking rows by distance within each cluster
-    property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(
-        method='first')
-
-    # Sorting to verify
-    property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
+    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
+    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
+    final_clusters["cluster"] = final_clusters["cluster"].astype(str)

    ################################################
    # Prepare outputs!!!!
    ################################################
+
    property_attributes.reset_index(inplace=True)
+    property_attributes = property_attributes.merge(
+        final_clusters, how="left", on="internal_id"
+    )
    property_attributes["archetype_representative"] = property_attributes["rank"] == 1

    asset_list_with_archetypes = asset_list.merge(
@ -1769,7 +1915,7 @@ def compile_data_final():
    asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
        "archetype_representative"].fillna(False)

-    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)

    stonewater_uprn_lookup = asset_list_with_archetypes[
        ["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
@ -1777,110 +1923,6 @@ def compile_data_final():

    stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")

-    ################################################
-    # Agglomertive Clustering
-    ################################################
-
-    # from sklearn.cluster import KMeans, AgglomerativeClustering
-    # from sklearn.preprocessing import StandardScaler, OneHotEncoder
-    # from sklearn.compose import ColumnTransformer
-    # from sklearn.pipeline import Pipeline
-    # from scipy.spatial.distance import cdist
-    # import numpy as np
-    # from collections import Counter
-    #
-    # id_column = 'internal_id'
-    # property_attributes.set_index(id_column, inplace=True)
-    #
-    # # Define the preprocessing for numerical and categorical features
-    # numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
-    # categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
-    #
-    # for col in categorical_features:
-    #     property_attributes[col] = property_attributes[col].astype(str)
-    #
-    # preprocessor = ColumnTransformer(
-    #     transformers=[
-    #         ('num', StandardScaler(), numerical_features),
-    #         ('cat', OneHotEncoder(sparse_output=False), categorical_features)
-    #     ]
-    # )
-    #
-    # # Function to perform clustering and merge small clusters
-    # def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5):
-    #     while True:
-    #         # Preprocess the data
-    #         processed_data = preprocessor.fit_transform(data)
-    #
-    #         # Initial clustering
-    #         clustering = AgglomerativeClustering(n_clusters=n_clusters)
-    #         labels = clustering.fit_predict(processed_data)
-    #
-    #         # Check cluster sizes
-    #         cluster_counts = Counter(labels)
-    #
-    #         # Find clusters smaller than min_size
-    #         small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size}
-    #
-    #         if not small_clusters:
-    #             break
-    #
-    #         # Merge small clusters
-    #         for cluster in small_clusters:
-    #             # Find the nearest cluster to merge with
-    #             cluster_data = processed_data[labels == cluster]
-    #             other_clusters = [i for i in range(n_clusters) if i not in small_clusters]
-    #             other_cluster_data = [processed_data[labels == i] for i in other_clusters]
-    #             other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data])
-    #
-    #             distances = cdist(cluster_data, other_centroids).mean(axis=0)
-    #             closest_cluster = other_clusters[np.argmin(distances)]
-    #
-    #             labels[labels == cluster] = closest_cluster
-    #
-    #         n_clusters -= len(small_clusters)
-    #
-    #     return labels
-    #
-    # # Perform clustering with minimum size constraint
-    # n_clusters = 10
-    # min_size = 5
-    # property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size)
-    #
-    # # Filter out empty clusters
-    # valid_clusters = property_attributes['cluster'].unique()
-    #
-    # # Get centroids for the resulting clusters
-    # processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"]))
-    # centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters])
-    #
-    # # Calculate distances from each point to the centroid of its cluster
-    # distances_to_centroids = [
-    #     cdist(processed_data[i].reshape(1, -1),
-    #           centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0]
-    #     for i, label in enumerate(property_attributes['cluster'])
-    # ]
-    #
-    # property_attributes['distance_to_centroid'] = distances_to_centroids
-    #
-    # # Verify that at least one point in each cluster has zero distance to the centroid
-    # for cluster_id in valid_clusters:
-    #     cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
-    #     min_distance = cluster_data['distance_to_centroid'].min()
-    #     print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
-    #     if min_distance != 0:
-    #         print(f"No point with zero distance found in cluster {cluster_id}")
-    #
-    # # Rank the distances within each cluster
-    # property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \
-    #     .rank(method='first')
-    #
-    # # Reset index to get 'internal_id' back
-    # property_attributes.reset_index(inplace=True)
-    #
-    # # Display the DataFrame
-    # print(property_attributes)
-

 def pull_ideal_postcodes(missing_uprn_with_udprn):
    api_key = ""  # Log into the platform to get the API key: https://account.ideal-postcodes.co.uk/
--- a/etl/epc_clean/epc_attributes/FloorAttributes.py
+++ b/etl/epc_clean/epc_attributes/FloorAttributes.py
@ -38,7 +38,7 @@ class FloorAttributes(Definitions):
        self.description: str = description.lower()

        self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
-            description in self.OBSERVED_ERRORS)
+            description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor")

        # Try to perform a translation, in case it's in Welsh
        self.translate_welsh_text()
--- a/etl/epc_clean/epc_attributes/HotWaterAttributes.py
+++ b/etl/epc_clean/epc_attributes/HotWaterAttributes.py
@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions):
    def __init__(self, description: str):
        self.description: str = clean_description(description.lower()).strip()

-        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
+            self.description == "sap05 hot-water"
+        )

        translation = self.WELSH_TEXT.get(self.description)

--- a/etl/epc_clean/epc_attributes/LightingAttributes.py
+++ b/etl/epc_clean/epc_attributes/LightingAttributes.py
@ -1,15 +1,18 @@
 import re
+from BaseUtility import Definitions
 from etl.epc_clean.epc_attributes.attribute_utils import clean_description
 from etl.epc_clean.utils import correct_spelling


-class LightingAttributes:
+class LightingAttributes(Definitions):
    WELSH_TEXT = {
        "goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets",
        "dim goleuadau ynni-isel": "no low energy lighting",
        "goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets'
    }

+    OBSERVED_ERRORS = []
+
    def __init__(self, description, averages):
        self.description: str = clean_description(description.lower())

@ -18,6 +21,9 @@ class LightingAttributes:
        self.description = correct_spelling(self.description)
        self.averages = averages

+        self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
+            description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting")
+
    def welsh_translation_search(self):
        """
        For welsh text describing the percentage of low energy lighting, we match the regular
@ -40,6 +46,9 @@ class LightingAttributes:

        description = self.description

+        if self.nodata:
+            return {"low_energy_proportion": None}
+
        if 'no low energy lighting' in description:
            return {"low_energy_proportion": 0}

--- a/etl/epc_clean/epc_attributes/MainheatAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py
@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions):

        self.description: str = clean_description(self.description).strip()
        # Remove special characters
-        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or (
+            description == "SAP05:Main-Heating"
+        )

        translation = self.WELSH_TEXT.get(self.description)
        if translation:
@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions):

        self.process_edge_cases()

-        if (not description or not any(
-            rt in self.description for rt in
-            self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
-        ) and not self.is_edge_case):
-            raise ValueError('Invalid description')
+        if not self.nodata:
+            if (not description or not any(
+                rt in self.description for rt in
+                self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
+            ) and not self.is_edge_case):
+                raise ValueError('Invalid description')

    def process_edge_cases(self) -> (dict, bool):
        """
--- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions):

    def __init__(self, description: str):
        self.description: str = clean_description(description.lower()).strip()
-        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
+            description == "SAP05:Main-Heating-Controls"
+        )

        translation = self.WELSH_TEXT.get(self.description)
        if translation:
--- a/etl/epc_clean/epc_attributes/WallAttributes.py
+++ b/etl/epc_clean/epc_attributes/WallAttributes.py
@ -75,12 +75,19 @@ class WallAttributes(Definitions):
        'insulation_thickness', 'external_insulation', 'internal_insulation'
    ]

+    CORRECTIONS = {
+        "Granite or whin, as built, no insulation (assumed)": "Granite or whinstone, as built, no insulation (assumed)",
+    }
+
    def __init__(self, description: str):
        """
        :param description: Description of the walls.
        """
        self.description: str = description

+        if self.description in self.CORRECTIONS:
+            self.description = self.CORRECTIONS[self.description]
+
        self.welsh_translation_search()

        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
--- a/etl/epc_clean/epc_attributes/WindowAttributes.py
+++ b/etl/epc_clean/epc_attributes/WindowAttributes.py
@ -38,7 +38,7 @@ class WindowAttributes(Definitions):

        # In the case of an empty description, we want to return a dictionary with all values set to False
        # and indicate there was no data
-        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows"

        translation = self.WELSH_TEXT.get(self.description)
        if translation:
--- a/etl/epc_clean/epc_attributes/attribute_utils.py
+++ b/etl/epc_clean/epc_attributes/attribute_utils.py
@ -2,8 +2,8 @@ import re
 import string
 from typing import Tuple, Union, Dict, List

-THERMAL_TRANSMITTANCE_STR = r"average thermal transmittance (-?\d+(\.\d+)?)\s(w/m\S+k)"
-THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTANCE_STR)
+THERMAL_TRANSMITTANCE_STR = r"average thermal transmittance\s*[=:-]?\s*(-?\d+(\.\d+)?)\s*[wW]/m\S*[kK]"
+THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTANCE_STR, re.IGNORECASE)

 DOUBLE_SPACE_PATTERN = re.compile(r"\s+")