Merge pull request #327 from Hestia-Homes/main

Pausing solar api, working on survey extraction
2026-06-08 11:17:27 +00:00 · 2024-07-24 16:57:16 +01:00 · 2024-07-24 16:57:16 +01:00 · 6e714127c6
commit 6e714127c6
parent ddd061f7df 1d642e71e3
19 changed files with 1072 additions and 142 deletions
--- a/backend/Property.py
+++ b/backend/Property.py
@ -350,8 +350,21 @@ class Property:
                r for r in property_representative_recommendations
                if r["phase"] <= phase
            ]
-            epc_transformations = [x["description_simulation"] for x in represenative_recs_to_this_phase]
-            
+
+            # TODO: This is a placeholder to handle the case of having both internal and external wall
+            #       insulation as options. Having both causes the process below to fall over, so we keep only
+            #       external wall insulation in epc_transformations when both are present
+            types = [
+                x["type"] for x in represenative_recs_to_this_phase
+            ]
+            if "external_wall_insulation" in types and "internal_wall_insulation" in types:
+                epc_transformations = [
+                    x["description_simulation"] for x in represenative_recs_to_this_phase if
+                    x["type"] != "internal_wall_insulation"
+                ]
+            else:
+                epc_transformations = [x["description_simulation"] for x in represenative_recs_to_this_phase]
+
            # It is possible that we could have two simulations applied to the same descriptions
            # We extract these out
            phase_epc_transformation = {}
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@ -488,13 +488,6 @@ async def trigger_plan(body: PlanTriggerRequest):
                     "carbon_ending"]
        )

-        from utils.s3 import save_dataframe_to_s3_parquet
-        save_dataframe_to_s3_parquet(
-            bucket_name="retrofit-datalake-dev",
-            file_key="recommendations_scoring_data_11th_july.parquet",
-            df=recommendations_scoring_data
-        )
-
        model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at)

        all_predictions = model_api.predictions_template()
@ -510,8 +503,6 @@ async def trigger_plan(body: PlanTriggerRequest):
            for key, scored in predictions_dict.items():
                all_predictions[key] = pd.concat([all_predictions[key], scored])

-        prediction_df = all_predictions["heating_cost_predictions"]
-
        # Insert the predictions into the recommendations and run the optimiser
        # TODO: If a recommendation has a negative impact on SAP, we should remove it - this seems to have become a
        #       possibility with heating system
--- a/etl/init.py
+++ b/etl/init.py
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@ -133,7 +133,7 @@ def app():
    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        # Skip the first 50
-        if i < 127:
+        if i < 250:
            continue

        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
@ -1 +1 @@
-[{"EPC": "D", "count": 1718}, {"EPC": "C", "count": 1343}, {"EPC": "E", "count": 538}, {"EPC": "F", "count": 80}, {"EPC": "B", "count": 52}, {"EPC": "G", "count": 3}, {"EPC": "A", "count": 2}]
+[{"EPC": "D", "count": 332}, {"EPC": "C", "count": 68}, {"EPC": "E", "count": 44}, {"EPC": "F", "count": 6}]
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
@ -1 +1 @@
-[{"is_real_epc": true, "count": 3736}, {"is_real_epc": false, "count": 1509}]
+[{"index": true, "is_real_epc": 3736}, {"index": false, "is_real_epc": 1509}]
--- a/etl/customers/stonewater/map_app/map_page.py
+++ b/etl/customers/stonewater/map_app/map_page.py
@ -31,7 +31,8 @@ def make_epc_rating_piechart(epc_rating_breakdown):
    labels = [x["EPC"] for x in epc_rating_breakdown]
    values = [x["count"] for x in epc_rating_breakdown]

-    marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
+    # marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
+    marker_colors = ["#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]

    fig = go.Figure(
        data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
@ -53,7 +54,10 @@ def make_map(locations):
    # Create custom hover text
    df['hover_text'] = df.apply(
        lambda row: f"UPRN: {int(row['uprn'])}<br>Address: {row['standardised_address']}<br>Postcode: "
-                    f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}",
+                    f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: "
+                    f"{row['LONGITUDE']}<br>Walls: {row['Walls']}<br>Roofs: {row['Roofs']}<br>Main Fuel: "
+                    f"{row['Main Fuel']}<br>Heating: {row['Heating']}<br>Age: {row['Age']}<br>Property Type: "
+                    f"{row['Property Type']}",
        axis=1)

    data = [
@ -93,8 +97,8 @@ def layout():
        locations = json.load(file)

    # Get the EPC breakdown data
-    with open("Stonewater real EPC breakdown.json") as file:
-        real_epc_breakdown = json.load(file)
+    # with open("Stonewater real EPC breakdown.json") as file:
+    #     real_epc_breakdown = json.load(file)

    # Get the EPC ratings data
    with open("Stonewater EPC rating breakdown.json") as file:
@ -149,7 +153,8 @@ def layout():
                                style={"font-size": "2.5rem", "font-weight": "bold", "margin-bottom": "20px"}
                            ),
                            html.P(
-                                "This map shows the location of the properties that are to be surveyed by Osmosis.",
+                                "This map shows the location of the properties that are to be surveyed by Osmosis. "
+                                "These properties span across 30 counties and 155 postal regions",
                                style={"font-size": "1.25rem", "margin-bottom": "40px"}
                            ),
                        ],
@ -170,22 +175,22 @@ def layout():
            ),
            dbc.Row(
                [
-                    dbc.Col(
-                        [
-                            html.Div(
-                                "Breakdown of real EPCs",
-                                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
-                                className='text-center'
-                            ),
-                            html.Div(
-                                "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
-                                "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
-                                style={"marginBottom": "1em"}
-                            ),
-                            make_real_epc_piechart(real_epc_breakdown),
-                        ],
-                        width={"size": 5},
-                    ),
+                    # dbc.Col(
+                    #     [
+                    #         html.Div(
+                    #             "Breakdown of real EPCs",
+                    #             style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
+                    #             className='text-center'
+                    #         ),
+                    #         html.Div(
+                    #             "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
+                    #             "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
+                    #             style={"marginBottom": "1em"}
+                    #         ),
+                    #         make_real_epc_piechart(real_epc_breakdown),
+                    #     ],
+                    #     width={"size": 5},
+                    # ),
                    dbc.Col(
                        [
                            html.Div(
@ -195,22 +200,9 @@ def layout():
                            ),
                            html.Div(
                                [
-                                    "This pie chart shows the breakdown of EPC ratings, for properties that currently "
-                                    "have an EPC. "
-                                    "The ratings range from A to G, where surprisingly, there are two EPC properties "
-                                    "that were initially "
-                                    "expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
-                                    " seen ",
-                                    html.A("here",
-                                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
-                                                "/2708-5001-7327-6090-7284",
-                                           target="_blank"),
-                                    " and ",
-                                    html.A("here",
-                                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
-                                                "/1037-4032-1009-0361-7292",
-                                           target="_blank"),
-                                    "."
+                                    "This pie chart shows the breakdown of expected and real EPC ratings, "
+                                    "for properties "
+                                    "that have been selected for sample",
                                ],
                                style={"marginBottom": "1em"}
                            ),
--- a/etl/customers/stonewater/outputs
+++ b/etl/customers/stonewater/outputs
@ -11,8 +11,9 @@ In this script, we do the following things:
 import pandas as pd
 import json
 from utils.s3 import read_pickle_from_s3
+from backend.app.utils import sap_to_epc

-stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
+stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
 archetyped_asset_list = stonewater_asset_list[
    [
        "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
 archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])

 # Read in and merge on clustering features
-clustering_features = read_pickle_from_s3(
-    bucket_name="retrofit-data-dev",
-    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+clustering_features = pd.read_csv(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
 )

-# Move property-type and built-form to the first two columns
-columns_to_move = ['property-type', 'built-form']
-
-# Get the remaining columns
-remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
-
-# Create the new column order
-new_column_order = columns_to_move + remaining_columns
-
-# Reorder the DataFrame
-clustering_features = clustering_features[new_column_order]
-
 archetyped_asset_list = archetyped_asset_list.merge(
-    clustering_features,
-    on="internal_id",
-    how="inner"
-)
+    clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
+    left_on="internal_id",
+    right_on="Osm. ID"
+).drop(columns=["Osm. ID"])

 archetyped_asset_list = archetyped_asset_list.rename(
    columns={
@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
 # archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)

 # We store the location data, which will be used for the mapping. We just need the longitude and latitude
+stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
+
 mapping_data = stonewater_asset_list[
    stonewater_asset_list["archetype_representative"]
-][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
+][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
+    archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
+    how="left",
+    on="uprn"
+)
+
+# We need to merge on longitude and latitude
+spatial_data_to_uprn = read_pickle_from_s3(
+    s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+    bucket_name="retrofit-data-dev"
+)
+
+
+# Function to convert specific columns to bool dtype
+def convert_specific_columns_to_bool(df, columns):
+    for column in columns:
+        if column in df.columns:
+            df[column] = df[column].astype(bool)
+    return df
+
+
+spatial_data_to_uprn = [convert_specific_columns_to_bool(
+    df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+) for df in spatial_data_to_uprn]
+
+spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+spatial_data_to_uprn = spatial_data_to_uprn.drop(
+    columns=["partition", "filename"]
+).rename(columns={"UPRN": "uprn"})
+spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")

 mapping_data = mapping_data.merge(
-    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
+    spatial_data_to_uprn[
+        ["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
+    ],
+    how="left",
+    on="uprn"
 )
 mapping_data = mapping_data.drop(columns=["internal_id"])

@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
    f.write(json.dumps(mapping_data.to_dict(orient="records")))

 # We also include some data for visualising the breakdown of EPCS
-proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
 # Invert the true and false
-proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
-proportion_of_real_epcs = proportion_of_real_epcs.rename(
-    columns={"estimated": "is_real_epc"}
-)
+# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
+# proportion_of_real_epcs = proportion_of_real_epcs.rename(
+#     columns={"estimated": "is_real_epc"}
+# )
+#
+# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
+#     f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))

-with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
-    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
+# Produce the breakdown of EPC ratings for properties to be surveyed
+clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)

-# Produce the breakdown of EPC ratings
 epc_rating_breakdown = (
-    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
+    clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
    .value_counts()
    .to_frame()
    .reset_index()
 )

 epc_rating_breakdown = epc_rating_breakdown.rename(
-    columns={"current-energy-rating": "EPC"}
+    columns={"index": "EPC", "representative_epc": "count"}
 )

 with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
    f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
-
-epc_a_properties = clustering_features[
-    (clustering_features["current-energy-rating"] == "A")
-    & (~clustering_features["estimated"])
-    ]
-
-epc_a_properties = epc_a_properties.merge(
-    stonewater_asset_list,
-    on="internal_id",
-    how="inner"
-)
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -13,12 +13,13 @@ import numpy as np
 import pandas as pd
 import time
 from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
-    save_dataframe_to_s3_parquet, save_pickle_to_s3
+    save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from scipy.spatial.distance import cdist
+from sklearn.metrics import pairwise_distances_argmin_min

 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -1083,11 +1084,11 @@ def compile_data():
    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)

    # TODO: Let's store this in s3
-    save_data_to_s3(
-        data=json.dumps(spatial_data_to_uprn.to_dict("records")),
-        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
-        bucket_name="retrofit-data-dev"
-    )
+    # save_data_to_s3(
+    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
+    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
+    #     bucket_name="retrofit-data-dev"
+    # )

    # We merge this spatial data onto final EPCS

@ -1429,17 +1430,17 @@ def compile_data_final():
            older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
    # Store in S3
    # TODO - read in instead of running
-    save_pickle_to_s3(
-        data=epc_data_batch_2,
-        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
-        bucket_name="retrofit-data-dev"
-    )
-
-    save_pickle_to_s3(
-        data=older_epcs_batch_2,
-        s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
-        bucket_name="retrofit-data-dev"
-    )
+    # save_pickle_to_s3(
+    #     data=epc_data_batch_2,
+    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
+    #     bucket_name="retrofit-data-dev"
+    # )
+    #
+    # save_pickle_to_s3(
+    #     data=older_epcs_batch_2,
+    #     s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
+    #     bucket_name="retrofit-data-dev"
+    # )

    epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
    complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@ -1799,6 +1800,10 @@ def compile_data_final():
        'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
    ]

+    additional_features = [
+
+    ]
+
    # Define the preprocessing for numerical and categorical features
    numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
@ -1957,3 +1962,710 @@ def pull_ideal_postcodes(missing_uprn_with_udprn):
            result["result"]
        )
        completed_id += 1
+
+
+def updated_version():
+    """
+    This version of the clustering factors in the updates received from Stonewater to simplify the archetyping process
+    using fewer variables and also factoring in their internal data sources.
+
+    This work began on the 23rd July 2024
+    :return:
+    """
+
+    ########################################################################
+    # Read in data
+    ########################################################################
+    asset_list = read_asset_list()
+    asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)
+
+    # Read in the properties that have been included in Osmosis' wave 2.1
+    osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
+
+    asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
+
+    # We also check the address & postcode
+    asset_list["In Osmosis Wave 2.1"] = np.where(
+        asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
+        True,
+        asset_list["In Osmosis Wave 2.1"]
+    )
+
+    priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
+
+    # Pull in the EPC data
+    epc_data = read_epc_data(uprn_lookup_2)
+
+    # Pull in the spatial data to UPRN
+    spatial_data_to_uprn = read_pickle_from_s3(
+        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+        bucket_name="retrofit-data-dev"
+    )
+
+    # Function to convert specific columns to bool dtype
+    def convert_specific_columns_to_bool(df, columns):
+        for column in columns:
+            if column in df.columns:
+                df[column] = df[column].astype(bool)
+        return df
+
+    spatial_data_to_uprn = [convert_specific_columns_to_bool(
+        df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+    ) for df in spatial_data_to_uprn]
+
+    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+    spatial_data_to_uprn = spatial_data_to_uprn.drop(
+        columns=["partition", "filename"]
+    ).rename(columns={"UPRN": "uprn"})
+    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
+
+    ########################################################################
+    # Prepare the data
+    ########################################################################
+
+    # Filter the asset list down to the priority postcodes
+    asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
+
+    master_sheet = master_sheet[
+        master_sheet["Address ID"].isin(
+            asset_list["external_address_id"].values
+        )
+    ]
+
+    master_sheet["days_since_lodgement"] = (
+        datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
+    ).dt.days
+
+    asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
+        master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
+        how="left",
+        left_on="external_address_id",
+        right_on="Address ID"
+    )
+
+    asset_list = asset_list.merge(
+        epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
+        how="left",
+        on="internal_id"
+    )
+    asset_list["days_since_lodgement_epc"] = (
+        datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
+    ).dt.days
+
+    # Flag properties that were surveyed within the last 5 years
+    asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365
+
+    # Identify properties that have had an EPC done within the last 5 years and whose rating is already
+    # at least an EPC C (69 or above). Alternatively, any property with an EPC rating of 80 or above is also
+    # flagged, regardless of when the EPC was done
+    asset_list["is_epc_c_or_above"] = (
+        ((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
+        (asset_list["EPC Rating"] >= 80)
+    )
+
+    clustering_features = asset_list[
+        asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
+        ~pd.isnull(asset_list["uprn"])
+        ][
+        [
+            "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
+            "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
+            "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
+        ]
+    ]
+
+    # Merge on the SAP data
+    clustering_features = clustering_features.merge(
+        master_sheet[
+            ["Address ID", "SAP"]
+        ].rename(columns={"SAP": "parity_modelled_sap"}),
+        how="left",
+        left_on="external_address_id",
+        right_on="Address ID"
+    )
+
+    # For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
+    clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
+    clustering_features["representative_sap"] = np.where(
+        clustering_features["epc_within_5_years"],
+        clustering_features["current-energy-efficiency"],
+        clustering_features["parity_modelled_sap"]
+    )
+
+    # We remove the final three characters from the postcode to give us the postal region. Removing only two gives
+    # us 415 values, which is too many
+    clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
+
+    # Merge on spatial features
+    clustering_features = clustering_features.merge(
+        spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
+        how="left",
+        on="uprn"
+    )
+
+    # incorect_epcs = clustering_features[
+    #     clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
+    # incorect_epcs = incorect_epcs[
+    #     ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
+    #     ]
+    # incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
+    # # Store data
+    # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
+
+    # We add in the key features, which are used for clustering
+    master_sheet_clustering_features = master_sheet[
+        ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
+    ].copy()
+
+    # Step 1: Remap walls - we end up with 11 types
+    master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
+        {
+            "TimberFrame: AsBuilt": "Other wall type, as built",
+            "SystemBuilt: AsBuilt": "Other wall type, as built",
+            "Sandstone: AsBuilt": "Other wall type, as built",
+            "Sandstone: Internal": "Other wall type, internal or external",
+            "SystemBuilt: External": "Other wall type, internal or external",
+            "GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
+            "TimberFrame: Internal": "Other wall type, internal or external",
+            "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
+            "SystemBuilt: Internal": "Other wall type, internal or external",
+            "Cavity: Internal": "Other wall type, internal or external",
+        }
+    )
+
+    # Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
+    #         gives us the insulation thickness
+
+    # Clean up incorrect values
+    master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
+        {
+            "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
+            "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
+            'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
+            'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
+            'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
+        }
+    )
+
+    master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
+        master_sheet_clustering_features['Roofs'].apply(
+            lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
+        )
+    )
+
+    # Strip any extra whitespace
+    master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
+    master_sheet_clustering_features['roof_insulation_thickness'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
+    )
+
+    def map_thickness(thickness):
+        try:
+            value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
+            return "Above 250mm" if value > 250 else "Below 250mm"
+        except ValueError:
+            return thickness  # Return the original value if it cannot be converted to a float
+
+    master_sheet_clustering_features['roof_insulation_category'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
+    )
+
+    # Ideas
+    # 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
+    # as a secondary category
+    # 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
+    # (could split on :, take first part)
+
+    clustering_features = clustering_features.merge(
+        master_sheet_clustering_features,
+        how="left",
+        on="Address ID"
+    )
+
+    # Reduce down to the final set of features we need
+    clustering_features = clustering_features[
+        [
+            "internal_id",
+            "Property Type",
+            # Location
+            "postal_region",
+            'conservation_status',
+            'is_listed_building',
+            'is_heritage_building',
+            "county",
+            # Walls
+            "walls_reduced",
+            # Roof
+            "roof_type",
+            "roof_insulation_category",
+            # Heating
+            "Heating",
+            # Fuel
+            "Main Fuel",
+            "Age",
+            "Total Floor Area",
+            "representative_sap",
+            "days_since_lodgement",
+        ]
+    ]
+
+    clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
+
+    def split_property_type(row):
+        parts = row.split(':')
+        property_type = parts[0].strip()
+        built_form = parts[1].strip() if len(parts) > 1 else ''
+        property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
+        return pd.Series([property_type, built_form, property_extended_feature])
+
+    clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
+        clustering_features['Property Type'].apply(split_property_type)
+    )
+    clustering_features = clustering_features.drop(columns=["Property Type"])
+
+    # These are the variables we MUST split by
+    grouping_columns = [
+        "property_type",
+        "walls_reduced",
+        "roof_type",
+        "Main Fuel",
+        "county",
+    ]
+
+    def combine_small_groups(clustering_features, grouping_columns, threshold=2):
+        """
+        Fold groups that are too small into the most similar larger group
+
+        Rows are grouped by grouping_columns. Any group with at most `threshold` rows is "small". Each small group
+        is moved, as a whole, into the group of the nearest row from the remaining groups. Distance is measured on
+        the one-hot encoded categorical features plus the raw numerical features.
+        :param clustering_features: DataFrame of features, must contain "internal_id" and all of grouping_columns
+        :param grouping_columns: List of the columns that define a group
+        :param threshold: Groups with this many rows or fewer are merged into another group
+        :return: DataFrame with the same rows, where the grouping columns of small-group rows have been overwritten.
+                 Rows of the groups that were large enough come first, followed by the reassigned rows
+        """
+        # Identify small groups
+        group_sizes = clustering_features.groupby(grouping_columns).size()
+        small_groups = group_sizes[group_sizes <= threshold].index.tolist()
+
+        # Split the rows into those that belong to a small group, and those that do not
+        in_small_group = clustering_features.set_index(grouping_columns).index.isin(small_groups)
+        # We copy, so that the reassignment below writes to our own frame and not to a slice of the input
+        small_group_data = clustering_features[in_small_group].copy()
+        clustering_features_ok = clustering_features[~in_small_group]
+
+        # Nothing to combine, or nothing to combine into
+        if small_group_data.empty or clustering_features_ok.empty:
+            return clustering_features
+
+        # One-Hot Encode categorical variables
+        categorical_features = (
+            clustering_features_ok.drop(columns=["internal_id"])
+            .select_dtypes(include=['object', 'category']).columns.tolist()
+        )
+        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+        ohe.fit(clustering_features_ok[categorical_features])
+
+        small_group_ohe = ohe.transform(small_group_data[categorical_features])
+        large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])
+
+        # NOTE(review): internal_id is only excluded from the categorical features. If it is stored as a number it
+        #               will be part of the distance calculation - confirm this is intended
+        numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
+        small_group_numerical = small_group_data[numerical_features].values
+        large_group_numerical = clustering_features_ok[numerical_features].values
+
+        # Concatenate one-hot encoded categorical and numerical features
+        small_group_features = np.hstack([small_group_ohe, small_group_numerical])
+        large_group_features = np.hstack([large_group_ohe, large_group_numerical])
+
+        # For every small-group ROW, the position of the nearest large-group row and the distance to it
+        closest_rows, closest_distances = pairwise_distances_argmin_min(small_group_features, large_group_features)
+        # The grouping values of that nearest row, aligned with the rows of small_group_data
+        candidate_groups = clustering_features_ok[grouping_columns].values[closest_rows]
+
+        # Update small groups to the nearest large group. The nearest rows are per row, not per group, so we look up
+        # the members of each small group explicitly and let the member with the smallest distance decide the target
+        original_keys = small_group_data.set_index(grouping_columns).index
+        for small_group in small_groups:
+            is_member = np.asarray(original_keys == small_group)
+            member_rows = np.flatnonzero(is_member)
+            if member_rows.size == 0:
+                # Categorical grouping columns can produce groups with no rows
+                continue
+            best_member = member_rows[np.argmin(closest_distances[member_rows])]
+            small_group_data.loc[is_member, grouping_columns] = candidate_groups[best_member]
+
+        combined_data = pd.concat([clustering_features_ok, small_group_data])
+        return combined_data
+
+    clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
+
+    ########################################################################
+    # Clustering
+    ########################################################################
+    numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
+    categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
+    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
+
+    for col in categorical_features:
+        clustering_features_combined[col] = clustering_features_combined[col].astype(str)
+
+    id_column = 'internal_id'
+    n_clusters = 450
+    random_state = 0
+
+    training_data_grouped = clustering_features_combined.groupby(grouping_columns)
+    group_sizes = {name: len(group) for name, group in training_data_grouped}
+    total_size = sum(group_sizes.values())
+    cluster_allocation = {
+        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
+    }
+
+    # Adjust cluster allocation to ensure total clusters sum to 450
+    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
+
+    final_clusters = []
+    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
+
+        group_n_clusters = cluster_allocation[group_variables]
+        group_data.set_index(id_column, inplace=True)
+
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', StandardScaler(), numerical_features),
+                ('cat', OneHotEncoder(), categorical_features)
+            ]
+        )
+
+        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
+                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
+
+        # Fit the pipeline to the data
+        pipeline.fit(group_data)
+
+        # Transform the data using the fitted pipeline
+        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
+
+        # Get cluster labels
+        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
+
+        # Get centroids (already in the same transformed space)
+        centroids = pipeline.named_steps['kmeans'].cluster_centers_
+
+        # if the data isn't an array, make it one
+        if not isinstance(processed_data, np.ndarray):
+            processed_data = processed_data.toarray()
+
+        # Calculate distances from each point to the centroid of its cluster
+        distances_to_centroids = [
+            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
+            for i, label in enumerate(group_data['cluster'])
+        ]
+
+        group_data['distance_to_centroid'] = distances_to_centroids
+
+        # Ranking rows by distance within each cluster
+        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
+
+        # Sorting to verify
+        group_data.sort_values(by=['cluster', 'rank'], inplace=True)
+        group_data.reset_index(inplace=True)
+
+        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
+        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
+        final_clusters.append(to_append)
+
+    final_clusters = pd.concat(final_clusters)
+    # remap the clusters from the current names to 1 -> n_clusters
+    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
+    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
+    final_clusters["cluster"] = final_clusters["cluster"].astype(str)
+
+    assigned_clusters = clustering_features_combined.merge(
+        final_clusters, how="left", on="internal_id"
+    )
+
+    assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
+
+    asset_list_with_archetypes = asset_list.merge(
+        assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
+        on="internal_id"
+    ).merge(
+        master_sheet_clustering_features[["Address ID", "Property Type", "Walls", "Roofs", "Heating"]],
+        how="left",
+        on="Address ID"
+    )
+
+    # We populate the reasons for no archetype
+    # 1) If it's not a priority postcode
+    asset_list_with_archetypes["cluster"] = np.where(
+        ~asset_list_with_archetypes["is_priority_postcode"],
+        "NOT PRIORITY POSTCODE",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # 2) If it's EPC C or above
+    asset_list_with_archetypes["cluster"] = np.where(
+        asset_list_with_archetypes["is_epc_c_or_above"],
+        "EPC C OR ABOVE",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # If it's in Wave 2.1
+    asset_list_with_archetypes["cluster"] = np.where(
+        asset_list_with_archetypes["In Osmosis Wave 2.1"],
+        "IN WAVE 2.1",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # Has missing uprn
+    asset_list_with_archetypes["cluster"] = np.where(
+        pd.isnull(asset_list_with_archetypes["uprn"]),
+        "MISSING UPRN",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["archetype_representative"] = (
+        asset_list_with_archetypes["archetype_representative"].fillna(False)
+    )
+
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.1.csv", index=False)
+
+    # Produce the archetyping features
+    archetyping_features_csv = assigned_clusters[
+        [
+            "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
+            "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
+        ]
+    ].merge(
+        asset_list[
+            ["internal_id", "uprn", "external_address_id"]
+        ],
+        how="left",
+        on="internal_id"
+    ).merge(
+        master_sheet_clustering_features,
+        how="left",
+        right_on="Address ID",
+        left_on="external_address_id"
+    ).drop(columns=["Address ID"]).rename(
+        columns={
+            "internal_id": "Osm. ID",
+            "external_address_id": "Address ID",
+        }
+    )
+
+    archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
+    archetyping_features_csv.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
+    )
+
+    representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
+    print(representatives["postal_region"].nunique())
+    print(representatives["county"].nunique())
+
+
+def read_asset_list():
+    """
+    Read the Stonewater asset list, attach the UDPRN of each address and build a single full address string
+    :return: DataFrame of assets with standardised column names, plus "udprn" and "full_address" columns. Only
+             assets that have an entry in the UDPRN workbook are kept
+    """
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        header=4
+    )
+
+    udprn_data = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
+    )
+    udprn_data = udprn_data[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
+    # Both columns are stored as strings, going via Int64 so that the UDPRN carries no decimal point
+    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
+    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
+
+    # Inner join, so assets without a UDPRN entry are dropped
+    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
+
+    # Standardise the column names
+    column_names = {
+        "UDPRN": "udprn",
+        "Osm. ID": "internal_id",
+        "Org. ref.": "customer_asset_id",
+        "Postcode": "postcode",
+        "House no": "house_number",
+        "Name": "address1",
+        "Address line 2": "address2",
+        "City/Town": "city_town",
+        "County": "county",
+        "Address ID": "external_address_id",
+        "Owning body": "owner"
+    }
+    asset_list = asset_list.rename(columns=column_names)
+
+    # The full address only includes address2 where we have one
+    town_and_postcode = asset_list["city_town"].str.title() + ", " + asset_list["postcode"]
+    address_with_line_2 = asset_list["address1"] + ", " + asset_list["address2"] + ", " + town_and_postcode
+    address_without_line_2 = asset_list["address1"] + ", " + town_and_postcode
+    asset_list["full_address"] = np.where(
+        asset_list["address2"].notnull(),
+        address_with_line_2,
+        address_without_line_2
+    )
+    return asset_list
+
+
+def merge_uprn_to_asset_list(asset_list):
+    """
+    Attach UPRNs and standardised addresses to the asset list, by combining four separately prepared lookups
+
+    The lookups are two JSON lookups from S3 (tagged "Exact" and "EPC"), an ideal-postcodes pull from S3 (tagged
+    "Exact") and a manually populated CSV (tagged "Fuzzy"). They are stacked into one lookup, which is inner joined
+    to the asset list on internal_id and external_address_id
+    :param asset_list: DataFrame of assets. The udprn, internal_id and external_address_id columns are used
+    :return: Tuple of (the asset list merged with the combined lookup, the "EPC" lookup on its own)
+    :raises AssertionError: If the total number of lookup rows differs from the number of rows in asset_list
+    """
+    # Read in the lookups
+    # NOTE(review): lookups 1 and 2 are read from a key starting "scustomers/", whereas lookup 3 uses "customers/".
+    #               Confirm that the "scustomers/" prefix is intended and not a typo
+    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
+    )))
+    uprn_lookup_1["match_type"] = "Exact"
+
+    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
+    )))
+    # Align the EPC column names with the other lookups
+    uprn_lookup_2 = uprn_lookup_2.rename(
+        columns={
+            "epc_address": "standardised_address",
+            "epc_postcode": "standardised_postcode"
+        }
+    )
+    uprn_lookup_2["match_type"] = "EPC"
+    # Manual override of the UPRN for internal_id 1091
+    uprn_lookup_2["uprn"] = np.where(
+        uprn_lookup_2["internal_id"] == 1091,
+        83143766,
+        uprn_lookup_2["uprn"]
+    )
+
+    uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
+    )))
+    # Build a single address string from the address parts (concatenate_row is defined elsewhere)
+    uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
+        concatenate_row, axis=1
+    )
+    uprn_lookup_3 = uprn_lookup_3[
+        ["udprn", "uprn", "standardised_address", "postcode"]
+    ].rename(columns={"postcode": "standardised_postcode"})
+    uprn_lookup_3["match_type"] = "Exact"
+
+    uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
+    uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
+    uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
+    # prepare lookup 4
+    # The "option" column selects which candidate to use for the row: 1 and 2 take the respective os_option
+    # columns, any other value takes the manual_* columns
+    uprn_lookup_4 = []
+    for _, x in uprn_lookup_4_basis.iterrows():
+
+        # These are never populated here, so both columns are always None for lookup 4
+        property_type = None
+        built_form = None
+        if x["option"] == 1:
+            uprn = x["os_option_1_uprn"]
+            standardised_address = x["os_option_1_address"]
+            postcode = x["os_option_1_postcode"]
+        elif x["option"] == 2:
+            uprn = x["os_option_2_uprn"]
+            standardised_address = x["os_option_2_address"]
+            # For option 2 the postcode is taken as the final comma separated part of the address
+            postcode = x["os_option_2_address"].split(", ")[-1]
+        else:
+            uprn = x["manual_uprn"]
+            standardised_address = x["manual_address"]
+            postcode = x["manual_postcode"]
+
+        uprn_lookup_4.append(
+            {
+                "internal_id": x["internal_id"],
+                "external_address_id": x["external_address_id"],
+                "uprn": uprn,
+                "standardised_address": standardised_address,
+                "standardised_postcode": postcode,
+                "property_type": property_type,
+                "built_form": built_form
+            }
+        )
+    uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
+    uprn_lookup_4["match_type"] = "Fuzzy"
+
+    # concat
+    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
+
+    # Sanity check: the lookups together must have as many rows as the asset list
+    # NOTE(review): assert statements are skipped when Python runs with -O
+    assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
+
+    # Final preps of lookups
+    # Lookup 3 has no ids of its own, so we pick them up from the asset list via the udprn
+    uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
+    uprn_lookup_3 = uprn_lookup_3.merge(
+        asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
+    )
+    uprn_lookup = pd.concat([
+        uprn_lookup,
+        uprn_lookup_3,
+        uprn_lookup_4
+    ])
+    uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
+
+    # The lookup's udprn is dropped before merging (presumably because asset_list already has a udprn column)
+    asset_list = asset_list.merge(
+        uprn_lookup.drop(columns=["udprn"]),
+        how="inner",
+        on=["internal_id", "external_address_id"]
+    )
+
+    return asset_list, uprn_lookup_2
+
+
+def read_omosis_wave_2_1():
+    """
+    Read the Osmosis SHDF Wave 2.1 workbook and extract the asset ids of the rows that have not been removed
+    from the program
+    :return: Tuple of (list of integer asset ids, the Wave 2.1 DataFrame without the "Removed from program" rows)
+    """
+    osmosis_wave_2_1 = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
+        header=4,
+    )
+    # Remove double spaces from "Name"
+    osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace("  ", " ")
+
+    osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
+    osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]
+
+    # We produce a cleaned list of asset ids from osmosis_wave_2_1
+    osmosis_wave_2_1_asset_ids = []
+    for raw_id in osmosis_wave_2_1["Asset ID"].values:
+        if pd.isnull(raw_id):
+            continue
+        if isinstance(raw_id, str):
+            # We have some ids that are in the form 'id1, id2' so we split them. Empty parts (e.g. from a trailing
+            # comma) are skipped, rather than failing on int("")
+            osmosis_wave_2_1_asset_ids.extend(
+                int(part.strip()) for part in raw_id.split(",") if part.strip()
+            )
+        else:
+            # A cell holding a single id can be read from Excel as a number, which has no .split
+            osmosis_wave_2_1_asset_ids.append(int(raw_id))
+
+    return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
+
+
+def read_stonewater_asset_data():
+    """
+    Read the Stonewater master sheet and the priority postcodes, and pick out the addresses flagged as being in a
+    Wave 2.1 programme
+    :return: Tuple of (list of priority postcodes, list of address ids flagged "Yes" for any Wave 2.1 column,
+             the master sheet DataFrame)
+    """
+    master_sheet = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
+        "sheet.csv",
+        encoding='latin1'
+    )
+    master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)
+
+    # An address counts as being in a previous wave if any of these columns is "Yes"
+    wave_columns = ["In Osmosis W2.1", "In Wates Wave 2.1", "In Liv Green Wave 2.1", "In CCS Wave 2.1"]
+    in_previous_wave = (master_sheet[wave_columns] == "Yes").any(axis=1)
+    previous_waves_address_id = master_sheet.loc[in_previous_wave, "Address ID"].dropna().astype(str).tolist()
+
+    # We also read the priority postcodes
+    priority_postcodes = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
+        "postcodes.csv",
+        header=17
+    )["Postcode"].tolist()
+
+    return priority_postcodes, previous_waves_address_id, master_sheet
+
+
+def read_epc_data(uprn_lookup_2):
+    """
+    Read both batches of EPC data from S3 and stack them into a single DataFrame
+    :param uprn_lookup_2: DataFrame with an internal_id column. Only first-batch EPCs with one of these ids are kept
+    :return: DataFrame of the filtered first batch, followed by the second batch
+    """
+    first_batch_json = read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="customers/Stonewater/clustering/epc_data.json"
+    )
+    epc_data = pd.DataFrame(json.loads(first_batch_json))
+
+    # Manual override of the UPRN for internal_id 1091
+    is_overridden = epc_data["internal_id"] == 1091
+    epc_data["uprn"] = np.where(is_overridden, 83143766, epc_data["uprn"])
+
+    # We drop some EPCs, keeping just those for the properties in uprn_lookup_2
+    is_in_lookup = epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)
+    epc_data = epc_data[is_in_lookup]
+
+    epc_data_batch_2 = pd.DataFrame(
+        read_pickle_from_s3(
+            s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
+            bucket_name="retrofit-data-dev"
+        )
+    )
+
+    return pd.concat([epc_data, epc_data_batch_2])
--- a/etl/sfr/epc_average_by_postcode.py
+++ b/etl/sfr/epc_average_by_postcode.py
@ -0,0 +1,80 @@
+import os
+from tqdm import tqdm
+import pandas as pd
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from backend.app.utils import sap_to_epc
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def app():
+    """
+    This script will retrieve EPC data for each address and postcode in the source file, and produce statistics on
+    the SAP Score. The average SAP is calculated by property type and built form, using only EPCs lodged from
+    2023 onwards for private rentals, and is written to "EPC Averages SFR.csv"
+    :return: None
+    """
+
+    source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
+    source_file["row_id"] = source_file.index
+    # Split out the town, which is taken to be the final space separated word of the address
+    # NOTE(review): this was previously described as comma separated, but the split is on spaces, so only the last
+    #               word of a multi-word town is extracted - confirm against the source data
+    source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
+    source_file["Address"] = source_file["Address"].apply(
+        lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
+    )
+
+    unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
+
+    # for each postcode, pull EPC data
+    collected_data = []
+    # These two lists are populated for inspection only, nothing below reads them
+    no_data_found = []
+    no_data_after_filters = []
+    for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
+        address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
+        searcher = SearchEpc(
+            postcode=config["Postcode"],
+            address1=address1,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key=""
+        )
+        # Search with the full address first. If nothing is found, we drop the last word of the address and try
+        # again, until we get results or run out of address
+        while True:
+            params = {
+                "postcode": config["Postcode"],
+                "address": address1,
+            }
+            results = searcher.client.domestic.search(params=params, size=10000)
+            if not results:
+                # We strip back address1
+                address1 = " ".join(address1.split(" ")[:-1])
+                if not address1:
+                    break
+            else:
+                break
+
+        if not results:
+            no_data_found.append(config)
+            continue
+
+        data = pd.DataFrame(results["rows"])
+
+        data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
+        # Take EPCs post 2023
+        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
+        data = data[data["lodgement-date"] >= "2023-01-01"]
+        # Take private rentals
+        data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
+
+        if data.empty:
+            no_data_after_filters.append(config)
+            continue
+
+        agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
+        agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
+        agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
+        agg.insert(0, "Postcode", config["Postcode"])
+        # Note that this is the address that actually returned results, which may have been stripped back
+        agg.insert(0, "Address", address1)
+
+        collected_data.append(agg)
+
+    if not collected_data:
+        # pd.concat raises a ValueError when given an empty list, so there is nothing to write in this case
+        print("No EPC data was collected, so no output file has been written")
+        return
+
+    collected_df = pd.concat(collected_data)
+    collected_df.to_csv("EPC Averages SFR.csv", index=False)
--- a/etl/sfr/example_retrofit_plan.py
+++ b/etl/sfr/example_retrofit_plan.py
@ -0,0 +1,37 @@
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+PORTFOLIO_ID = 85
+USER_ID = 8
+
+
+def app():
+    """
+    Upload a sample asset list containing a single property to S3, and print an example request body that
+    references the uploaded file
+    :return: None
+    """
+    sample_assets = pd.DataFrame(
+        [
+            {
+                "address": "120 Yarningale Road",
+                "postcode": "B14 6NB",
+                "uprn": 100070575194
+            }
+        ]
+    )
+
+    # The file is keyed by user and portfolio
+    trigger_file_path = f"{USER_ID}/{PORTFOLIO_ID}/sample.csv"
+    save_csv_to_s3(dataframe=sample_assets, bucket_name="retrofit-plan-inputs-dev", file_name=trigger_file_path)
+
+    request_body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Private",
+        "goal": "Increase EPC",
+        "goal_value": "C",
+        "trigger_file_path": trigger_file_path,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": "",
+        "budget": None,
+    }
+    print(request_body)
--- a/etl/xml_survey_extraction/README.md
+++ b/etl/xml_survey_extraction/README.md
@ -0,0 +1,3 @@
+# Survey Extraction App
+
+This app is responsible for extracting survey data from energy assessment XMLs
--- a/etl/xml_survey_extraction/app.py
+++ b/etl/xml_survey_extraction/app.py
@ -0,0 +1,9 @@
+def main():
+    """
+    This function executes the main process, which will retrieve data from the specified locations, extract the data
+    fields and store them to our database
+
+    Not implemented yet: the function currently contains only this docstring and the TODO below
+    :return: None
+    """
+
+    # TODO: Build solution to get this data from Onedrive and store what we need in S3
+    #       In s3, we have a bucket called retrofit-energy-assessments-{stage} which
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@ -175,6 +175,12 @@ module "retrofit_hotwater_kwh_predictions" {
  allowed_origins = var.allowed_origins
 }

+# Set up the retrofit-energy-assessments-<stage> S3 bucket, using the shared s3 module
+module "retrofit_energy_assessments" {
+  source          = "./modules/s3"
+  bucketname      = "retrofit-energy-assessments-${var.stage}"
+  allowed_origins = var.allowed_origins
+}
+
 # Set up the route53 record for the API
 module "route53" {
  source         = "./modules/route53"
--- a/recommendations/HeatingControlRecommender.py
+++ b/recommendations/HeatingControlRecommender.py
@ -100,9 +100,10 @@ class HeatingControlRecommender:
        We can then consider the heating system itself
        :return:
        """
+        new_description = "Controls for high heat retention storage heaters"

        # We recommend upgrading to Celect type controls
-        ending_config = MainheatControlAttributes("Controls for high heat retention storage heaters").process()
+        ending_config = MainheatControlAttributes(new_description).process()
        # We look at what has changed in the ending config, and compare it to the current config
        simulation_config = check_simulation_difference(
            new_config=ending_config, old_config=self.property.main_heating_controls
@ -110,11 +111,17 @@ class HeatingControlRecommender:
        # This upgrade will only take the heating system to average energy efficiency
        simulation_config["mainheatc_energy_eff_ending"] = "Good"

+        description_simulation = {
+            "mainheatcont-description": new_description,
+            "mainheatc-energy-eff": simulation_config["mainheatc_energy_eff_ending"]
+        }
+
        self.recommendation.append(
            {
                "description": "upgrade heating controls to High Heat Retention Storage Heater Controls",
                **self.costs.celect_type_controls(),
-                "simulation_config": simulation_config
+                "simulation_config": simulation_config,
+                "description_simulation": description_simulation
            }
        )

@ -152,7 +159,9 @@ class HeatingControlRecommender:
        if not can_recommend:
            return

-        ending_config = MainheatControlAttributes("Programmer, room thermostat and TRVS").process()
+        new_controls_description = "Programmer, room thermostat and TRVS"
+
+        ending_config = MainheatControlAttributes(new_controls_description).process()
        # We use this to determine how we should be updating the config
        simulation_config = check_simulation_difference(
            new_config=ending_config, old_config=self.property.main_heating_controls
@ -161,6 +170,13 @@ class HeatingControlRecommender:
        # If the current system is below good, we make it good
        if self.property.data["mainheatc-energy-eff"] in ["Poor", "Very Poor", "Average"]:
            simulation_config["mainheatc_energy_eff_ending"] = "Good"
+        else:
+            simulation_config["mainheatc_energy_eff_ending"] = self.property.data["mainheatc-energy-eff"]
+
+        description_simulation = {
+            "mainheatcont-description": new_controls_description,
+            "mainheatc-energy-eff": simulation_config["mainheatc_energy_eff_ending"]
+        }

        has_programmer = not needs_programmer
        has_room_thermostat = not needs_room_thermostat
@ -191,10 +207,7 @@ class HeatingControlRecommender:
                "sap_points": None,
                "already_installed": already_installed,
                "simulation_config": simulation_config,
-                "description_simulation": {
-                    "mainheatcont-description": "Programmer, room thermostat and TRVS",
-                    "mainheatc-energy-eff": "Good"
-                }
+                "description_simulation": description_simulation
            }
        )

@ -221,7 +234,9 @@ class HeatingControlRecommender:
            # No recommendation needed
            return

-        ending_config = MainheatControlAttributes("Time and temperature zone control").process()
+        new_controls_description = "Time and temperature zone control"
+
+        ending_config = MainheatControlAttributes(new_controls_description).process()

        # We use this to determine how we should be updating the config
        simulation_config = check_simulation_difference(
@ -231,7 +246,13 @@ class HeatingControlRecommender:
        # If the current system is below very good, we make it very good
        if self.property.data["mainheatc-energy-eff"] in ["Poor", "Very Poor", "Average", "Good"]:
            simulation_config["mainheatc_energy_eff_ending"] = "Very Good"
+        else:
+            simulation_config["mainheatc_energy_eff_ending"] = self.property.data["mainheatc-energy-eff"]

+        description_simulation = {
+            "mainheatcont-description": new_controls_description,
+            "mainheatc-energy-eff": simulation_config["mainheatc_energy_eff_ending"]
+        }
        cost_result = self.costs.time_and_temperature_zone_control(
            number_heated_rooms=int(self.property.data["number-heated-rooms"])
        )
@ -255,9 +276,6 @@ class HeatingControlRecommender:
                "sap_points": None,
                "already_installed": already_installed,
                "simulation_config": simulation_config,
-                "description_simulation": {
-                    "mainheatcont-description": "Time and temperature zone control",
-                    "mainheatc-energy-eff": "Very Good"
-                }
+                "description_simulation": description_simulation
            }
        )
--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@ -55,7 +55,7 @@ class HeatingRecommender:
        # TODO: We could have a system flush recommendation for an existing boiler, where there is no need to replace
        #       the boiler, but instead flushing the system will make it run more efficiently. There is a cost for this
        #       in the Costs class, stored as SYSTEM_FLUSH_COST
-        
+
        exclusions = [] if exclusions is None else exclusions

        self.heating_recommendations = []
@ -86,7 +86,8 @@ class HeatingRecommender:
        electic_heating_has_mains = self.has_electric_heating_description and self.property.data["mains-gas-flag"]

        portable_heaters_has_mains = (
-            self.property.main_heating["clean_description"] in ["Portable electric heaters assumed for most rooms"] and
+            self.property.main_heating["clean_description"] in ["Portable electric heaters assumed for most rooms"]
+            and
            self.property.data["mains-gas-flag"]
        )

@ -238,28 +239,31 @@ class HeatingRecommender:
                description = description + (f" The cost includes the £"
                                             f"{BOILER_UPGRADE_SCHEME_ASHP_VALUE} boiler upgrade scheme grant")

+        new_heating_description = "Air source heat pump, radiators, electric"
+        new_hot_water_description = "From main system"
        simulation_config = {
            "mainheat_energy_eff_ending": "Good",
            "hot_water_energy_eff_ending": "Good"
        }
        description_simulation = {
-            "mainheat-description": "Air source heat pump, radiators, electric",
-            "mainheat-energy-eff": "Good",
-            "hot-water-energy-eff": "Good",
-            "hotwater-description": "From main system",
+            "mainheat-description": new_heating_description,
+            "mainheat-energy-eff": simulation_config["mainheat_energy_eff_ending"],
+            "hot-water-energy-eff": simulation_config["hot_water_energy_eff_ending"],
+            "hotwater-description": new_hot_water_description,
        }
        # Installation of a boiler improves the hot water system so we need to reflect this in
        # the outcome of the recommendation
-        heating_ending_config = MainHeatAttributes("Air source heat pump, radiators, electric").process()
-        hotwater_ending_config = HotWaterAttributes("From main system").process()
+        heating_ending_config = MainHeatAttributes(new_heating_description).process()
+        hotwater_ending_config = HotWaterAttributes(new_hot_water_description).process()

        # If the property does not currently have electric main fuel, we'll simulate the change
        fuel_ending_config = {}
        if self.property.main_fuel["fuel_type"] != "electricity":
-            fuel_ending_config = MainFuelAttributes("electricity (not community)").process()
+            new_fuel_description = "electricity (not community)"
+            fuel_ending_config = MainFuelAttributes(new_fuel_description).process()
            description_simulation = {
                **description_simulation,
-                "main-fuel": "electricity (not community)"
+                "main-fuel": new_fuel_description
            }

        # Check the simulation differences
@ -292,8 +296,7 @@ class HeatingRecommender:

            description_simulation = {
                **description_simulation,
-                "mainheatcont-description": "time and temperature zone control",
-                "mainheatc-energy-eff": "Very Good"
+                **controls_recommender.recommendation[0]["description_simulation"]
            }

        ashp_recommendation = {
@ -330,7 +333,14 @@ class HeatingRecommender:
        return differences

    def combine_heating_and_controls(
-        self, controls_recommendations, heating_simulation_config, costs, description, phase, heating_controls_only,
+        self,
+        controls_recommendations,
+        heating_simulation_config,
+        heating_description_simulation,
+        costs,
+        description,
+        phase,
+        heating_controls_only,
        system_change
    ):
        """
@ -338,6 +348,7 @@ class HeatingRecommender:
        into a single recommendation
        :param controls_recommendations: The heating controls recommendations
        :param heating_simulation_config: The simulation configuration for the heating system
+        :param heating_description_simulation: The simulation configuration for the heating description
        :param costs: The costs of the heating system
        :param description: The description of the recommendation
        :param phase: The phase of the recommendation
@ -361,6 +372,7 @@ class HeatingRecommender:
        for controls_switch in heating_controls_switch:
            total_costs = costs.copy()
            recommendation_simulation_config = heating_simulation_config.copy()
+            recommendation_description_simulation = heating_description_simulation.copy()
            recommendation_description = description
            if controls_switch:
                # We add the costs of the heating controls, onto each key in the costs dictionary
@ -371,6 +383,12 @@ class HeatingRecommender:
                    **recommendation_simulation_config,
                    **controls_recommendations[0]["simulation_config"]
                }
+
+                recommendation_description_simulation = {
+                    **recommendation_description_simulation,
+                    **controls_recommendations[0]["description_simulation"]
+                }
+
                controls_description = controls_recommendations[0]['description']
                # Make the first letter of the description lowercase
                controls_description = (
@ -396,7 +414,8 @@ class HeatingRecommender:
                "sap_points": None,
                "already_installed": already_installed,
                **total_costs,
-                "simulation_config": recommendation_simulation_config
+                "simulation_config": recommendation_simulation_config,
+                "description_simulation": recommendation_description_simulation
            }

            output.append(recommendation)
@ -474,8 +493,10 @@ class HeatingRecommender:
            # No recommendation needed
            return

+        new_heating_description = "Electric storage heaters, radiators"
+
        # Set up artefacts, suitable for the simulation and regardless of controls
-        heating_ending_config = MainHeatAttributes("Electric storage heaters, radiators").process()
+        heating_ending_config = MainHeatAttributes(new_heating_description).process()
        heating_simulation_config = check_simulation_difference(
            new_config=heating_ending_config, old_config=self.property.main_heating
        )
@ -497,9 +518,15 @@ class HeatingRecommender:
        )
        description = "Install high heat retention electric storage heaters"

+        heating_description_simulation = {
+            "mainheat-description": new_heating_description,
+            "mainheat-energy-eff": heating_simulation_config["mainheat_energy_eff_ending"],
+        }
+
        recommendations = self.combine_heating_and_controls(
            controls_recommendations=controls_recommender.recommendation,
            heating_simulation_config=heating_simulation_config,
+            heating_description_simulation=heating_description_simulation,
            costs=costs,
            description=description,
            phase=phase,
@ -580,6 +607,7 @@ class HeatingRecommender:
        simulation_config = {}
        boiler_costs = {}
        boiler_recommendation = {}
+        description_simulation = {}

        has_inefficient_space_heating = self.property.data["mainheat-energy-eff"] in ["Very Poor", "Poor", "Average"]

@ -603,12 +631,22 @@ class HeatingRecommender:
                "mainheat_energy_eff_ending": "Good",
                "hot_water_energy_eff_ending": "Good"
            }
+
+            description_simulation = {
+                "mainheat-energy-eff": simulation_config["mainheat_energy_eff_ending"],
+                "hot-water-energy-eff": simulation_config["hot_water_energy_eff_ending"],
+            }
+
            if system_change:
                # Installation of a boiler improves the hot water system so we need to reflect this in
                # the outcome of the recommendation
-                heating_ending_config = MainHeatAttributes("Boiler and radiators, mains gas").process()
-                hotwater_ending_config = HotWaterAttributes("From main system").process()
-                fuel_ending_config = MainFuelAttributes("mains gas (not community)").process()
+                new_heating_description = "Boiler and radiators, mains gas"
+                new_hotwater_description = "From main system"
+                new_fuel_description = "mains gas (not community)"
+
+                heating_ending_config = MainHeatAttributes(new_heating_description).process()
+                hotwater_ending_config = HotWaterAttributes(new_hotwater_description).process()
+                fuel_ending_config = MainFuelAttributes(new_fuel_description).process()

                heating_simulation_config = check_simulation_difference(
                    new_config=heating_ending_config, old_config=self.property.main_heating
@ -627,6 +665,13 @@ class HeatingRecommender:
                    **fuel_simulation_config,
                }

+                description_simulation = {
+                    **description_simulation,
+                    "mainheat-description": new_heating_description,
+                    "hotwater-description": new_hotwater_description,
+                    "main-fuel": new_fuel_description
+                }
+
            boiler_costs = self.costs.boiler(
                size=f"{boiler_size}kw",
                exising_room_heaters=exising_room_heaters,
@ -652,6 +697,7 @@ class HeatingRecommender:
                "sap_points": None,
                "already_installed": already_installed,
                "simulation_config": simulation_config,
+                "description_simulation": description_simulation,
                **boiler_costs
            }

@ -675,6 +721,7 @@ class HeatingRecommender:
                combined_recommendation = self.combine_heating_and_controls(
                    controls_recommendations=[controls_recommendation],
                    heating_simulation_config=simulation_config,
+                    heating_description_simulation=description_simulation,
                    costs=boiler_costs,
                    description=boiler_recommendation["description"],
                    phase=recommendation_phase,
--- a/recommendations/Recommendations.py
+++ b/recommendations/Recommendations.py
@ -782,6 +782,11 @@ class Recommendations:
                        }
                    }

+                # Clamp the predictions at zero so that none of them can be negative
+                predicted_sap_points = 0 if predicted_sap_points < 0 else predicted_sap_points
+                predicted_co2_savings = 0 if predicted_co2_savings < 0 else predicted_co2_savings
+                predicted_heat_demand = 0 if predicted_heat_demand < 0 else predicted_heat_demand
+
                if rec["type"] == "low_energy_lighting":
                    # For the moment, we cap the number of SAP points that can be achieved by ventilation at 2
                    rec["sap_points"] = min(predicted_sap_points, LightingRecommendations.SAP_LIMIT)
--- a/recommendations/WallRecommendations.py
+++ b/recommendations/WallRecommendations.py
@ -524,6 +524,10 @@ class WallRecommendations(Definitions):
                            "already_installed": already_installed,
                            "sap_points": None,
                            "simulation_config": simulation_config,
+                            "description_simulation": {
+                                "walls-description": new_description,
+                                "walls-energy-eff": simulation_config["walls_energy_eff_ending"]
+                            },
                            **cost_result
                        }
                    )