Merge pull request #323 from Hestia-Homes/solar-api

Solar api
2026-07-27 23:35:01 +00:00 · 2024-07-23 20:11:05 +01:00 · 2024-07-23 20:11:05 +01:00 · 1dafedc733
commit 1dafedc733
parent 688ff0588c 36b3b4bea5
8 changed files with 894 additions and 101 deletions
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@ -133,7 +133,7 @@ def app():
    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        # Skip the first 50
-        if i < 127:
+        if i < 250:
            continue

        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
@ -1 +1 @@
-[{"EPC": "D", "count": 1718}, {"EPC": "C", "count": 1343}, {"EPC": "E", "count": 538}, {"EPC": "F", "count": 80}, {"EPC": "B", "count": 52}, {"EPC": "G", "count": 3}, {"EPC": "A", "count": 2}]
+[{"EPC": "D", "count": 332}, {"EPC": "C", "count": 68}, {"EPC": "E", "count": 44}, {"EPC": "F", "count": 6}]
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
--- a/etl/customers/stonewater/map_app/Stonewater
+++ b/etl/customers/stonewater/map_app/Stonewater
@ -1 +1 @@
-[{"is_real_epc": true, "count": 3736}, {"is_real_epc": false, "count": 1509}]
+[{"index": true, "is_real_epc": 3736}, {"index": false, "is_real_epc": 1509}]
--- a/etl/customers/stonewater/map_app/map_page.py
+++ b/etl/customers/stonewater/map_app/map_page.py
@ -31,7 +31,8 @@ def make_epc_rating_piechart(epc_rating_breakdown):
    labels = [x["EPC"] for x in epc_rating_breakdown]
    values = [x["count"] for x in epc_rating_breakdown]

-    marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
+    # marker_colors = ["#117d58", "#2da55c", "#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]
+    marker_colors = ["#8dbd40", "#f7cd14", "#f3a96a", "#ef8026", "#e41e3b"]

    fig = go.Figure(
        data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
@ -53,7 +54,10 @@ def make_map(locations):
    # Create custom hover text
    df['hover_text'] = df.apply(
        lambda row: f"UPRN: {int(row['uprn'])}<br>Address: {row['standardised_address']}<br>Postcode: "
-                    f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}",
+                    f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: "
+                    f"{row['LONGITUDE']}<br>Walls: {row['Walls']}<br>Roofs: {row['Roofs']}<br>Main Fuel: "
+                    f"{row['Main Fuel']}<br>Heating: {row['Heating']}<br>Age: {row['Age']}<br>Property Type: "
+                    f"{row['Property Type']}",
        axis=1)

    data = [
@ -93,8 +97,8 @@ def layout():
        locations = json.load(file)

    # Get the EPC breakdown data
-    with open("Stonewater real EPC breakdown.json") as file:
-        real_epc_breakdown = json.load(file)
+    # with open("Stonewater real EPC breakdown.json") as file:
+    #     real_epc_breakdown = json.load(file)

    # Get the EPC ratings data
    with open("Stonewater EPC rating breakdown.json") as file:
@ -149,7 +153,8 @@ def layout():
                                style={"font-size": "2.5rem", "font-weight": "bold", "margin-bottom": "20px"}
                            ),
                            html.P(
-                                "This map shows the location of the properties that are to be surveyed by Osmosis.",
+                                "This map shows the location of the properties that are to be surveyed by Osmosis. "
+                                "These properties span across 30 counties and 155 postal regions",
                                style={"font-size": "1.25rem", "margin-bottom": "40px"}
                            ),
                        ],
@ -170,22 +175,22 @@ def layout():
            ),
            dbc.Row(
                [
-                    dbc.Col(
-                        [
-                            html.Div(
-                                "Breakdown of real EPCs",
-                                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
-                                className='text-center'
-                            ),
-                            html.Div(
-                                "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
-                                "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
-                                style={"marginBottom": "1em"}
-                            ),
-                            make_real_epc_piechart(real_epc_breakdown),
-                        ],
-                        width={"size": 5},
-                    ),
+                    # dbc.Col(
+                    #     [
+                    #         html.Div(
+                    #             "Breakdown of real EPCs",
+                    #             style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
+                    #             className='text-center'
+                    #         ),
+                    #         html.Div(
+                    #             "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
+                    #             "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
+                    #             style={"marginBottom": "1em"}
+                    #         ),
+                    #         make_real_epc_piechart(real_epc_breakdown),
+                    #     ],
+                    #     width={"size": 5},
+                    # ),
                    dbc.Col(
                        [
                            html.Div(
@ -195,22 +200,9 @@ def layout():
                            ),
                            html.Div(
                                [
-                                    "This pie chart shows the breakdown of EPC ratings, for properties that currently "
-                                    "have an EPC. "
-                                    "The ratings range from A to G, where surprisingly, there are two EPC properties "
-                                    "that were initially "
-                                    "expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
-                                    " seen ",
-                                    html.A("here",
-                                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
-                                                "/2708-5001-7327-6090-7284",
-                                           target="_blank"),
-                                    " and ",
-                                    html.A("here",
-                                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
-                                                "/1037-4032-1009-0361-7292",
-                                           target="_blank"),
-                                    "."
+                                    "This pie chart shows the breakdown of expected and real EPC ratings, "
+                                    "for properties "
+                                    "that have been selected for sample",
                                ],
                                style={"marginBottom": "1em"}
                            ),
--- a/etl/customers/stonewater/outputs
+++ b/etl/customers/stonewater/outputs
@ -11,8 +11,9 @@ In this script, we do the following things:
 import pandas as pd
 import json
 from utils.s3 import read_pickle_from_s3
+from backend.app.utils import sap_to_epc

-stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
+stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V3.csv")
 archetyped_asset_list = stonewater_asset_list[
    [
        "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
@ -25,28 +26,15 @@ archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
 archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])

 # Read in and merge on clustering features
-clustering_features = read_pickle_from_s3(
-    bucket_name="retrofit-data-dev",
-    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+clustering_features = pd.read_csv(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv",
 )

-# Move property-type and built-form to the first two columns
-columns_to_move = ['property-type', 'built-form']
-
-# Get the remaining columns
-remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
-
-# Create the new column order
-new_column_order = columns_to_move + remaining_columns
-
-# Reorder the DataFrame
-clustering_features = clustering_features[new_column_order]
-
 archetyped_asset_list = archetyped_asset_list.merge(
-    clustering_features,
-    on="internal_id",
-    how="inner"
-)
+    clustering_features.drop(columns=['uprn', 'Address ID', "rank", "cluster", "archetype_representative"]),
+    left_on="internal_id",
+    right_on="Osm. ID"
+).drop(columns=["Osm. ID"])

 archetyped_asset_list = archetyped_asset_list.rename(
    columns={
@ -82,12 +70,47 @@ archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
 # archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)

 # We store the location data, which will be used for the mapping. We just need the longitude and latitude
+stonewater_asset_list["uprn"] = stonewater_asset_list["uprn"].astype('Int64')
+
 mapping_data = stonewater_asset_list[
    stonewater_asset_list["archetype_representative"]
-][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
+][["internal_id", "uprn", "standardised_address", "standardised_postcode"]].merge(
+    archetyped_asset_list[["uprn", "Walls", "Roofs", "Main Fuel", "Heating", "Age", "Property Type"]],
+    how="left",
+    on="uprn"
+)
+
+# We need to merge on longitude and latitude
+spatial_data_to_uprn = read_pickle_from_s3(
+    s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+    bucket_name="retrofit-data-dev"
+)
+
+
+# Function to convert specific columns to bool dtype
+def convert_specific_columns_to_bool(df, columns):
+    for column in columns:
+        if column in df.columns:
+            df[column] = df[column].astype(bool)
+    return df
+
+
+spatial_data_to_uprn = [convert_specific_columns_to_bool(
+    df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+) for df in spatial_data_to_uprn]
+
+spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+spatial_data_to_uprn = spatial_data_to_uprn.drop(
+    columns=["partition", "filename"]
+).rename(columns={"UPRN": "uprn"})
+spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str).astype("Int64")

 mapping_data = mapping_data.merge(
-    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
+    spatial_data_to_uprn[
+        ["uprn", "LONGITUDE", "LATITUDE", "conservation_status", "is_listed_building", "is_heritage_building"]
+    ],
+    how="left",
+    on="uprn"
 )
 mapping_data = mapping_data.drop(columns=["internal_id"])

@ -95,38 +118,28 @@ with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w")
    f.write(json.dumps(mapping_data.to_dict(orient="records")))

 # We also include some data for visualising the breakdown of EPCS
-proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
 # Invert the true and false
-proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
-proportion_of_real_epcs = proportion_of_real_epcs.rename(
-    columns={"estimated": "is_real_epc"}
-)
+# proportion_of_real_epcs = (~clustering_features["estimated"]).value_counts().to_frame().reset_index()
+# proportion_of_real_epcs = proportion_of_real_epcs.rename(
+#     columns={"estimated": "is_real_epc"}
+# )
+#
+# with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
+#     f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))

-with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
-    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
+# Produce the breakdown of EPC ratings for properties to be surveyed
+clustering_features["representative_epc"] = clustering_features["representative_sap"].apply(sap_to_epc)

-# Produce the breakdown of EPC ratings
 epc_rating_breakdown = (
-    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
+    clustering_features[clustering_features["archetype_representative"]]["representative_epc"]
    .value_counts()
    .to_frame()
    .reset_index()
 )

 epc_rating_breakdown = epc_rating_breakdown.rename(
-    columns={"current-energy-rating": "EPC"}
+    columns={"index": "EPC", "representative_epc": "count"}
 )

 with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
    f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
-
-epc_a_properties = clustering_features[
-    (clustering_features["current-energy-rating"] == "A")
-    & (~clustering_features["estimated"])
-    ]
-
-epc_a_properties = epc_a_properties.merge(
-    stonewater_asset_list,
-    on="internal_id",
-    how="inner"
-)
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -13,12 +13,13 @@ import numpy as np
 import pandas as pd
 import time
 from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
-    save_dataframe_to_s3_parquet, save_pickle_to_s3
+    save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from scipy.spatial.distance import cdist
+from sklearn.metrics import pairwise_distances_argmin_min

 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -1083,11 +1084,11 @@ def compile_data():
    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)

    # TODO: Let's store this in s3
-    save_data_to_s3(
-        data=json.dumps(spatial_data_to_uprn.to_dict("records")),
-        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
-        bucket_name="retrofit-data-dev"
-    )
+    # save_data_to_s3(
+    #     data=json.dumps(spatial_data_to_uprn.to_dict("records")),
+    #     s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
+    #     bucket_name="retrofit-data-dev"
+    # )

    # We merge this spatial data onto final EPCS

@ -1429,17 +1430,17 @@ def compile_data_final():
            older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
    # Store in S3
    # TODO - read in instead of running
-    save_pickle_to_s3(
-        data=epc_data_batch_2,
-        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
-        bucket_name="retrofit-data-dev"
-    )
-
-    save_pickle_to_s3(
-        data=older_epcs_batch_2,
-        s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
-        bucket_name="retrofit-data-dev"
-    )
+    # save_pickle_to_s3(
+    #     data=epc_data_batch_2,
+    #     s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
+    #     bucket_name="retrofit-data-dev"
+    # )
+    #
+    # save_pickle_to_s3(
+    #     data=older_epcs_batch_2,
+    #     s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
+    #     bucket_name="retrofit-data-dev"
+    # )

    epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
    complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@ -1799,6 +1800,10 @@ def compile_data_final():
        'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
    ]

+    additional_features = [
+
+    ]
+
    # Define the preprocessing for numerical and categorical features
    numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
@ -1957,3 +1962,706 @@ def pull_ideal_postcodes(missing_uprn_with_udprn):
            result["result"]
        )
        completed_id += 1
+
+
+def updated_version():
+    """
+    This version of the clustering factors in the updates recieved from Stonewater to simplify the archetyping process
+    using fewer variables and also factoring in their internal data sources
+
+    This work began on the 23rd July 2024
+    :return:
+    """
+
+    ########################################################################
+    # Read in data
+    ########################################################################
+    asset_list = read_asset_list()
+    asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)
+
+    # Read in the properties that have been included in Osmosis' wave 2.1
+    osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
+
+    asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
+
+    # We also check the address & postcode
+    asset_list["In Osmosis Wave 2.1"] = np.where(
+        asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
+        True,
+        asset_list["In Osmosis Wave 2.1"]
+    )
+
+    priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
+
+    # Pull in the EPC data
+    epc_data = read_epc_data(uprn_lookup_2)
+
+    # Pull in the spatial data to UPRN
+    spatial_data_to_uprn = read_pickle_from_s3(
+        s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+        bucket_name="retrofit-data-dev"
+    )
+
+    # Function to convert specific columns to bool dtype
+    def convert_specific_columns_to_bool(df, columns):
+        for column in columns:
+            if column in df.columns:
+                df[column] = df[column].astype(bool)
+        return df
+
+    spatial_data_to_uprn = [convert_specific_columns_to_bool(
+        df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
+    ) for df in spatial_data_to_uprn]
+
+    spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
+    spatial_data_to_uprn = spatial_data_to_uprn.drop(
+        columns=["partition", "filename"]
+    ).rename(columns={"UPRN": "uprn"})
+    spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
+
+    ########################################################################
+    # Prepare the data
+    ########################################################################
+
+    # Filter the asset list down to the priority postcodes
+    asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
+
+    master_sheet = master_sheet[
+        master_sheet["Address ID"].isin(
+            asset_list["external_address_id"].values
+        )
+    ]
+
+    master_sheet["days_since_lodgement"] = (
+        datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
+    ).dt.days
+
+    asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
+        master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
+        how="left",
+        left_on="external_address_id",
+        right_on="Address ID"
+    )
+
+    asset_list = asset_list.merge(
+        epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]],
+        how="left",
+        on="internal_id"
+    )
+    asset_list["days_since_lodgement_epc"] = (
+        datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True)
+    ).dt.days
+
+    # Flag properties that were surveyed within the last 5 years
+    asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365
+
+    # Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
+    # a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
+    # the EPC is done
+    asset_list["is_epc_c_or_above"] = (
+        ((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
+        (asset_list["EPC Rating"] >= 80)
+    )
+
+    clustering_features = asset_list[
+        asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] &
+        ~pd.isnull(asset_list["uprn"])
+        ][
+        [
+            "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2",
+            "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date",
+            "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date",
+        ]
+    ]
+
+    # Merge on the SAP data
+    clustering_features = clustering_features.merge(
+        master_sheet[
+            ["Address ID", "SAP"]
+        ].rename(columns={"SAP": "parity_modelled_sap"}),
+        how="left",
+        left_on="external_address_id",
+        right_on="Address ID"
+    )
+
+    # For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap
+    clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float)
+    clustering_features["representative_sap"] = np.where(
+        clustering_features["epc_within_5_years"],
+        clustering_features["current-energy-efficiency"],
+        clustering_features["parity_modelled_sap"]
+    )
+
+    # We remove the final three entries from postcode to give us postal region. Removing two gives us 415 values which
+    # is too many
+    clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
+
+    # Merge on spatial features
+    clustering_features = clustering_features.merge(
+        spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
+        how="left",
+        on="uprn"
+    )
+
+    # incorect_epcs = clustering_features[
+    #     clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
+    # incorect_epcs = incorect_epcs[
+    #     ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"])
+    #     ]
+    # incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"})
+    # # Store data
+    # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
+
+    # We add in the key features, which are used for clustering
+    master_sheet_clustering_features = master_sheet[
+        ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
+    ].copy()
+
+    # Step 1: Remap walls - we end up with 11 types
+    master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
+        {
+            "TimberFrame: AsBuilt": "Other wall type, as built",
+            "SystemBuilt: AsBuilt": "Other wall type, as built",
+            "Sandstone: AsBuilt": "Other wall type, as built",
+            "Sandstone: Internal": "Other wall type, internal or external",
+            "SystemBuilt: External": "Other wall type, internal or external",
+            "GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
+            "TimberFrame: Internal": "Other wall type, internal or external",
+            "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
+            "SystemBuilt: Internal": "Other wall type, internal or external",
+            "Cavity: Internal": "Other wall type, internal or external",
+        }
+    )
+
+    # Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
+    #         gives us the insulation thickness
+
+    # Clean an incorrect value
+    master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
+        {
+            "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
+            "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
+            'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
+            'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
+            'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
+        }
+    )
+
+    master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
+        master_sheet_clustering_features['Roofs'].apply(
+            lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
+        )
+    )
+
+    # Strip any extra whitespace
+    master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
+    master_sheet_clustering_features['roof_insulation_thickness'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
+    )
+
+    def map_thickness(thickness):
+        try:
+            value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
+            return "Above 250mm" if value > 250 else "Below 250mm"
+        except ValueError:
+            return thickness  # Return the original value if it cannot be converted to a float
+
+    master_sheet_clustering_features['roof_insulation_category'] = (
+        master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
+    )
+
+    # Ideas
+    # 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
+    # as a secondary category
+    # 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
+    # (could split on :, take first part)
+
+    clustering_features = clustering_features.merge(
+        master_sheet_clustering_features,
+        how="left",
+        on="Address ID"
+    )
+
+    # Reduce down to the final set of features we need
+    clustering_features = clustering_features[
+        [
+            "internal_id",
+            "Property Type",
+            # Location
+            "postal_region",
+            'conservation_status',
+            'is_listed_building',
+            'is_heritage_building',
+            "county",
+            # Walls
+            "walls_reduced",
+            # Roof
+            "roof_type",
+            "roof_insulation_category",
+            # Heating
+            "Heating",
+            # Fuel
+            "Main Fuel",
+            "Age",
+            "Total Floor Area",
+            "representative_sap",
+            "days_since_lodgement",
+        ]
+    ]
+
+    clustering_features["days_since_lodgement"] = clustering_features["days_since_lodgement"].fillna(99999)
+
+    def split_property_type(row):
+        parts = row.split(':')
+        property_type = parts[0].strip()
+        built_form = parts[1].strip() if len(parts) > 1 else ''
+        property_extended_feature = parts[2].strip() if len(parts) > 2 else ''
+        return pd.Series([property_type, built_form, property_extended_feature])
+
+    clustering_features[['property_type', 'built_form', 'property_extended_feature']] = (
+        clustering_features['Property Type'].apply(split_property_type)
+    )
+    clustering_features = clustering_features.drop(columns=["Property Type"])
+
+    # These are the variables we MUST split by
+    grouping_columns = [
+        "property_type",
+        "walls_reduced",
+        "roof_type",
+        "Main Fuel",
+        "county",
+    ]
+
+    def combine_small_groups(clustering_features, grouping_columns, threshold=2):
+        # Identify small groups
+        group_sizes = clustering_features.groupby(grouping_columns).size()
+        small_groups = group_sizes[group_sizes <= threshold].index.tolist()
+
+        # Remove small groups from the original clustering_features
+        small_group_data = clustering_features[clustering_features.set_index(grouping_columns).index.isin(small_groups)]
+        clustering_features_ok = clustering_features[
+            ~clustering_features.set_index(grouping_columns).index.isin(small_groups)
+        ]
+
+        if small_group_data.empty:
+            return clustering_features
+
+        # One-Hot Encode categorical variables
+        categorical_features = (
+            clustering_features_ok.drop(columns=["internal_id"])
+            .select_dtypes(include=['object', 'category']).columns.tolist()
+        )
+        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+        ohe.fit(clustering_features_ok[categorical_features])
+
+        # Combine small groups with the nearest available group
+        small_group_ohe = ohe.transform(small_group_data[categorical_features])
+        large_group_ohe = ohe.transform(clustering_features_ok[categorical_features])
+
+        numerical_features = clustering_features_ok.select_dtypes(include=['int64', 'float64']).columns.tolist()
+        small_group_numerical = small_group_data[numerical_features].values
+        large_group_numerical = clustering_features_ok[numerical_features].values
+
+        # Concatenate one-hot encoded categorical and numerical features
+        small_group_features = np.hstack([small_group_ohe, small_group_numerical])
+        large_group_features = np.hstack([large_group_ohe, large_group_numerical])
+
+        # Calculate distances and find nearest groups
+        closest_groups, _ = pairwise_distances_argmin_min(small_group_features, large_group_features)
+        closest_group_index = clustering_features_ok.iloc[closest_groups].index
+
+        # Update small groups to the nearest large group
+        for small_group, closest_group in zip(small_groups, closest_group_index):
+            small_group_mask = small_group_data.set_index(grouping_columns).index == small_group
+            small_group_data.loc[small_group_mask, grouping_columns] = clustering_features_ok.loc[
+                closest_group, grouping_columns].values
+
+        combined_data = pd.concat([clustering_features_ok, small_group_data])
+        return combined_data
+
+    clustering_features_combined = combine_small_groups(clustering_features, grouping_columns)
+
+    ########################################################################
+    # Clustering
+    ########################################################################
+    numerical_features = clustering_features_combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
+    categorical_features = clustering_features_combined.select_dtypes(include=['object', 'category']).columns.tolist()
+    categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
+
+    for col in categorical_features:
+        clustering_features_combined[col] = clustering_features_combined[col].astype(str)
+
+    id_column = 'internal_id'
+    n_clusters = 450
+    random_state = 0
+
+    training_data_grouped = clustering_features_combined.groupby(grouping_columns)
+    group_sizes = {name: len(group) for name, group in training_data_grouped}
+    total_size = sum(group_sizes.values())
+    cluster_allocation = {
+        name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
+    }
+
+    # Adjust cluster allocation to ensure total clusters sum to 450
+    cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
+
+    final_clusters = []
+    for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
+
+        group_n_clusters = cluster_allocation[group_variables]
+        group_data.set_index(id_column, inplace=True)
+
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ('num', StandardScaler(), numerical_features),
+                ('cat', OneHotEncoder(), categorical_features)
+            ]
+        )
+
+        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
+                                   ('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
+
+        # Fit the pipeline to the data
+        pipeline.fit(group_data)
+
+        # Transform the data using the fitted pipeline
+        processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
+
+        # Get cluster labels
+        group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
+
+        # Get centroids (already in the same transformed space)
+        centroids = pipeline.named_steps['kmeans'].cluster_centers_
+
+        # if the data isn't an array, make it one
+        if not isinstance(processed_data, np.ndarray):
+            processed_data = processed_data.toarray()
+
+        # Calculate distances from each point to the centroid of its cluster
+        distances_to_centroids = [
+            cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
+            for i, label in enumerate(group_data['cluster'])
+        ]
+
+        group_data['distance_to_centroid'] = distances_to_centroids
+
+        # Ranking rows by distance within each cluster
+        group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
+
+        # Sorting to verify
+        group_data.sort_values(by=['cluster', 'rank'], inplace=True)
+        group_data.reset_index(inplace=True)
+
+        to_append = group_data[["internal_id", "cluster", "rank"]].copy()
+        to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
+        final_clusters.append(to_append)
+
+    final_clusters = pd.concat(final_clusters)
+    # remap the clusters from the current names to 1 -> n_clusters
+    cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
+    final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
+    final_clusters["cluster"] = final_clusters["cluster"].astype(str)
+
+    assigned_clusters = clustering_features_combined.merge(
+        final_clusters, how="left", on="internal_id"
+    )
+
+    assigned_clusters["archetype_representative"] = assigned_clusters["rank"] == 1
+
+    asset_list_with_archetypes = asset_list.merge(
+        assigned_clusters[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
+        on="internal_id"
+    )
+
+    # We populate the reasons for no archetype
+    # 1) If it's not a priority postcode
+    asset_list_with_archetypes["cluster"] = np.where(
+        ~asset_list_with_archetypes["is_priority_postcode"],
+        "NOT PRIORITY POSTCODE",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # 2) If it's EPC C or above
+    asset_list_with_archetypes["cluster"] = np.where(
+        asset_list_with_archetypes["is_epc_c_or_above"],
+        "EPC C OR ABOVE",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # If it's in Wave 2.1
+    asset_list_with_archetypes["cluster"] = np.where(
+        asset_list_with_archetypes["In Osmosis Wave 2.1"],
+        "IN WAVE 2.1",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    # Has missing uprn
+    asset_list_with_archetypes["cluster"] = np.where(
+        pd.isnull(asset_list_with_archetypes["uprn"]),
+        "MISSING UPRN",
+        asset_list_with_archetypes["cluster"]
+    )
+
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
+    asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
+
+    asset_list_with_archetypes["archetype_representative"] = (
+        asset_list_with_archetypes["archetype_representative"].fillna(False)
+    )
+
+    asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V3.csv", index=False)
+
+    # Produce the archetyping features
+    archetyping_features_csv = assigned_clusters[
+        [
+            "internal_id", "cluster", "archetype_representative", "rank", "conservation_status", "is_listed_building",
+            "is_heritage_building", "postal_region", "county", "representative_sap", "days_since_lodgement"
+        ]
+    ].merge(
+        asset_list[
+            ["internal_id", "uprn", "external_address_id"]
+        ],
+        how="left",
+        on="internal_id"
+    ).merge(
+        master_sheet_clustering_features,
+        how="left",
+        right_on="Address ID",
+        left_on="external_address_id"
+    ).drop(columns=["Address ID"]).rename(
+        columns={
+            "internal_id": "Osm. ID",
+            "external_address_id": "Address ID",
+        }
+    )
+
+    archetyping_features_csv = archetyping_features_csv.sort_values(["cluster", "rank"], ascending=True)
+    archetyping_features_csv.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater archetyping features V3.csv", index=False
+    )
+
+    representatives = archetyping_features_csv[archetyping_features_csv["archetype_representative"]]
+    print(representatives["postal_region"].nunique())
+    print(representatives["county"].nunique())
+
+
+def read_asset_list():
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        header=4
+    )
+
+    udprn_data = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
+    )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
+    udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
+    udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
+
+    asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
+    asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
+
+    asset_list = asset_list.rename(
+        columns={
+            "Osm. ID": "internal_id",
+            "Org. ref.": "customer_asset_id",
+            "Postcode": "postcode",
+            "House no": "house_number",
+            "Name": "address1",
+            "Address line 2": "address2",
+            "City/Town": "city_town",
+            "County": "county",
+            "Address ID": "external_address_id",
+            "Owning body": "owner"
+        }
+    )
+
+    asset_list["full_address"] = np.where(
+        ~pd.isnull(asset_list["address2"]),
+        (
+            asset_list["address1"] + ", " +
+            asset_list["address2"] + ", " +
+            asset_list["city_town"].str.title() + ", " +
+            asset_list["postcode"]
+        ),
+        asset_list["address1"] + ", " +
+        asset_list["city_town"].str.title() + ", " +
+        asset_list["postcode"]
+    )
+    return asset_list
+
+
+def merge_uprn_to_asset_list(asset_list):
+    # Read in the lookups
+    uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
+    )))
+    uprn_lookup_1["match_type"] = "Exact"
+
+    uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
+    )))
+    uprn_lookup_2 = uprn_lookup_2.rename(
+        columns={
+            "epc_address": "standardised_address",
+            "epc_postcode": "standardised_postcode"
+        }
+    )
+    uprn_lookup_2["match_type"] = "EPC"
+    uprn_lookup_2["uprn"] = np.where(
+        uprn_lookup_2["internal_id"] == 1091,
+        83143766,
+        uprn_lookup_2["uprn"]
+    )
+
+    uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
+        bucket_name="retrofit-data-dev",
+        s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
+    )))
+    uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
+        concatenate_row, axis=1
+    )
+    uprn_lookup_3 = uprn_lookup_3[
+        ["udprn", "uprn", "standardised_address", "postcode"]
+    ].rename(columns={"postcode": "standardised_postcode"})
+    uprn_lookup_3["match_type"] = "Exact"
+
+    uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
+    uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
+    uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
+    # prepare lookup 4
+    uprn_lookup_4 = []
+    for _, x in uprn_lookup_4_basis.iterrows():
+
+        property_type = None
+        built_form = None
+        if x["option"] == 1:
+            uprn = x["os_option_1_uprn"]
+            standardised_address = x["os_option_1_address"]
+            postcode = x["os_option_1_postcode"]
+        elif x["option"] == 2:
+            uprn = x["os_option_2_uprn"]
+            standardised_address = x["os_option_2_address"]
+            postcode = x["os_option_2_address"].split(", ")[-1]
+        else:
+            uprn = x["manual_uprn"]
+            standardised_address = x["manual_address"]
+            postcode = x["manual_postcode"]
+
+        uprn_lookup_4.append(
+            {
+                "internal_id": x["internal_id"],
+                "external_address_id": x["external_address_id"],
+                "uprn": uprn,
+                "standardised_address": standardised_address,
+                "standardised_postcode": postcode,
+                "property_type": property_type,
+                "built_form": built_form
+            }
+        )
+    uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
+    uprn_lookup_4["match_type"] = "Fuzzy"
+
+    # concat
+    uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
+
+    assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
+
+    # Final preps of lookups
+    uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
+    uprn_lookup_3 = uprn_lookup_3.merge(
+        asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
+    )
+    uprn_lookup = pd.concat([
+        uprn_lookup,
+        uprn_lookup_3,
+        uprn_lookup_4
+    ])
+    uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
+
+    asset_list = asset_list.merge(
+        uprn_lookup.drop(columns=["udprn"]),
+        how="inner",
+        on=["internal_id", "external_address_id"]
+    )
+
+    return asset_list, uprn_lookup_2
+
+
+def read_omosis_wave_2_1():
+    osmosis_wave_2_1 = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
+        header=4,
+    )
+    # Remove double spaces from "Name"
+    osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace("  ", " ")
+
+    osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
+    osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]
+    # We produce a cleaned list of asset ids from osmosis_wave_2_1
+    osmosis_wave_2_1_asset_ids = [x for x in osmosis_wave_2_1["Asset ID"].values if not pd.isnull(x)]
+    # We have some ids that are in the form 'id1, id2' so we split them
+    osmosis_wave_2_1_asset_ids = [int(x.strip()) for id_str in osmosis_wave_2_1_asset_ids for x in id_str.split(",")]
+
+    return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
+
+
+def read_stonewater_asset_data():
+    master_sheet = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
+        "sheet.csv",
+        encoding='latin1'
+    )
+
+    master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)
+
+    previous_waves = master_sheet[
+        (master_sheet["In Osmosis W2.1"] == "Yes") |
+        (master_sheet["In Wates Wave 2.1"] == "Yes") |
+        (master_sheet["In Liv Green Wave 2.1"] == "Yes") |
+        (master_sheet["In CCS Wave 2.1"] == "Yes")
+        ].copy()
+
+    previous_waves_address_id = [str(x) for x in previous_waves["Address ID"].values if not pd.isnull(x)]
+
+    # We also read the priority postcodes
+    priority_postcodes = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
+        "postcodes.csv",
+        header=17
+    )
+
+    priority_postcodes = priority_postcodes["Postcode"].tolist()
+
+    return priority_postcodes, previous_waves_address_id, master_sheet
+
+
+def read_epc_data(uprn_lookup_2):
+    epc_data = json.loads(
+        read_from_s3(
+            bucket_name="retrofit-data-dev",
+            s3_file_name="customers/Stonewater/clustering/epc_data.json"
+        )
+    )
+    epc_data = pd.DataFrame(epc_data)
+
+    epc_data["uprn"] = np.where(
+        epc_data["internal_id"] == 1091,
+        83143766,
+        epc_data["uprn"]
+    )
+
+    # We drop come EPCS
+    epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)]
+
+    epc_data_batch_2 = read_pickle_from_s3(
+        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
+        bucket_name="retrofit-data-dev"
+    )
+    epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
+
+    complete_epcs = pd.concat([epc_data, epc_data_batch_2])
+
+    return complete_epcs
--- a/etl/sfr/epc_average_by_postcode.py
+++ b/etl/sfr/epc_average_by_postcode.py
@ -0,0 +1,80 @@
+import os
+from tqdm import tqdm
+import pandas as pd
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from backend.app.utils import sap_to_epc
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def app():
+    """
+    This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score
+    :return:
+    """
+
+    source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
+    source_file["row_id"] = source_file.index
+    # Split out the town, which is the final portion of the string, separated by commas
+    source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
+    source_file["Address"] = source_file["Address"].apply(
+        lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
+    )
+
+    unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
+
+    # for each postcode, pull EPC data
+    collected_data = []
+    no_data_found = []
+    no_data_after_filters = []
+    for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
+        address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
+        searcher = SearchEpc(
+            postcode=config["Postcode"],
+            address1=address1,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key=""
+        )
+        while True:
+            params = {
+                "postcode": config["Postcode"],
+                "address": address1,
+            }
+            results = searcher.client.domestic.search(params=params, size=10000)
+            if not results:
+                # We strip back address1
+                address1 = " ".join(address1.split(" ")[:-1])
+                if not address1:
+                    break
+            else:
+                break
+
+        if not results:
+            no_data_found.append(config)
+            continue
+
+        data = pd.DataFrame(results["rows"])
+
+        data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
+        # Take EPCs post 2023
+        data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
+        data = data[data["lodgement-date"] >= "2023-01-01"]
+        # Take private nrentals
+        data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
+
+        if data.empty:
+            no_data_after_filters.append(config)
+            continue
+
+        agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
+        agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
+        agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
+        agg.insert(0, "Postcode", config["Postcode"])
+        agg.insert(0, "Address", address1)
+
+        collected_data.append(agg)
+
+    collected_df = pd.concat(collected_data)
+    collected_df.to_csv("EPC Averages SFR.csv", index=False)