diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index c397f962..3b44d560 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1635,8 +1635,9 @@ def propsed_wave_3_sample():
         header=4
     )
 
-    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
-    asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"]
+    # TODO: We drop 302 properties that are not priority postcodes and 7 properties that are missing a UPRN -
+    # confirm w/ Stonewater
+    asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
     # Clean address ids
     asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@@ -1648,7 +1649,7 @@ def propsed_wave_3_sample():
 
     # Keep just the columns we need
     asset_list = asset_list[
-        ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
+        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
          "Heating"]
     ]
 
@@ -1665,7 +1666,7 @@ def propsed_wave_3_sample():
     survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
 
     survey_results_with_original_features = survey_results.merge(
-        asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+        asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
         on="Address ID",
         how="left"
     )
@@ -1673,6 +1674,45 @@ def propsed_wave_3_sample():
     if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
         raise ValueError("Something went wrong")
 
+    # Load latitude & longitude per UPRN so surveyed properties can be weighted by distance
+    from utils.s3 import read_pickle_from_s3
+    archetyping_spatial_features = read_pickle_from_s3(
+        bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+    )
+    archetyping_spatial_features = pd.concat(archetyping_spatial_features)
+    archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename(
+        columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"}
+    )
+    # Merge them onto both datasets
+    asset_list = asset_list.merge(
+        archetyping_spatial_features, how="left", on="UPRN"
+    )
+    if pd.isnull(asset_list["longitude"]).sum():
+        raise ValueError("Something went wrong")
+
+    survey_results_with_original_features = survey_results_with_original_features.merge(
+        archetyping_spatial_features, how="left", on="UPRN"
+    )
+    if pd.isnull(survey_results_with_original_features["longitude"]).sum():
+        raise ValueError("Something went wrong")
+
+    def haversine(lat1, lon1, lat2, lon2):
+        # Great-circle distance between (lat1, lon1) and (lat2, lon2), given in degrees; result is in meters
+        R = 6371000  # Radius of Earth in meters
+
+        # Convert degrees to radians
+        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
+
+        # Differences (callers pass lat2/lon2 as arrays, so these are element-wise)
+        dlat = lat2 - lat1
+        dlon = lon2 - lon1
+
+        # Haversine formula: a = sin^2(half the central angle), c = central angle in radians
+        a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
+        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
+        distance = R * c  # arc length = radius * angle
+        return distance
+
     # Tier definitions
     # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D
     # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
@@ -1716,6 +1756,7 @@ def propsed_wave_3_sample():
             ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
+            blah1
             region_surveyed = survey_results[
                 survey_results["Archetype ID"].isin(archetypes) &
                 (survey_results["Postal Region"] == region)
@@ -1755,23 +1796,46 @@ def propsed_wave_3_sample():
             survey_results["Archetype ID"].isin(missed_archetypes)
         ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
-        # TODO - We could average the property?? And call it borderline, call out it was averaged!!!
-        #        We could also find the nearest property to it, with similar wall, roof, heating?
-        #        Can use long/lag to distance calc. We have this data from previous
-
         if archetype_surveyed["Archetype ID"].duplicated().sum():
-            archetype_surveyed = survey_results[
-                survey_results["Archetype ID"].isin(missed_archetypes)
-            ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
-            archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
-            archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])
 
-        region_assets = region_assets.merge(
-            archetype_surveyed,
-            on="Archetype ID",
-            how="left",
-            suffixes=("", "_method2")
-        )
+            archetype_surveyed = []
+            for arch_id in missed_archetypes:
+                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                    archetype_data = survey_results_with_original_features[
+                        survey_results["Archetype ID"] == arch_id
+                        ].copy()
+                    if archetype_data.empty:
+                        continue
+                    archetype_data["distance_meters"] = haversine(
+                        lat1=property.latitude, lon1=property.longitude,
+                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                    )
+                    expected_sap = np.average(
+                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                    )
+                    expected_epc = sap_to_epc(expected_sap)
+                    archetype_surveyed.append(
+                        {
+                            "Archetype ID": arch_id,
+                            "Address ID": property["Address ID"],
+                            "Current EPC Band": expected_epc
+                        }
+                    )
+            archetype_surveyed = pd.DataFrame(archetype_surveyed)
+            region_assets = region_assets.merge(
+                archetype_surveyed,
+                on=["Archetype ID", "Address ID"],
+                how="left",
+                suffixes=("", "_method2")
+            )
+        else:
+
+            region_assets = region_assets.merge(
+                archetype_surveyed,
+                on="Archetype ID",
+                how="left",
+                suffixes=("", "_method2")
+            )
 
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
@@ -1792,6 +1856,16 @@ def propsed_wave_3_sample():
             "5 - EPC C or above", region_assets["Confidence Tier"]
         )
 
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Archetype ID"] == "EPC C OR ABOVE",
+            "5 - EPC C or above", region_assets["Confidence Tier"]
+        )
+
+        region_assets["Current EPC Band"] = np.where(
+            region_assets["Archetype ID"] == "EPC C OR ABOVE",
+            "C", region_assets["Current EPC Band"]
+        )
+
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 
         if not missed_addressids:
@@ -1803,17 +1877,10 @@ def propsed_wave_3_sample():
         for a_id in missed_addressids:
             property = asset_list[asset_list["Address ID"] == a_id].squeeze()
 
-            if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
-                filter_property_types = ["House", "Bungalow"]
-            else:
-                filter_property_types = ["Flat"]
-
-            surveyed_similar = survey_results_with_original_features[
-                (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+            surveyed = survey_results_with_original_features[
                 (
-                    survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    )
+                    survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+                    property["Property Type"].split(":")[0]
                 ) &
                 (
                     survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
@@ -1827,62 +1894,38 @@ def propsed_wave_3_sample():
                     survey_results_with_original_features["Heating"].str.split(":").str[0] ==
                     property["Heating"].split(":")[0]
                 )
-                ]
-            if surveyed_similar.empty:
-                surveyed_similar = survey_results_with_original_features[
-                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    )) &
-                    (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-                     property["Wall Type"].split(":")[0]) &
-                    (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-                     property["Roof Type"].split(":")[0]) &
-                    (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-                     property["Heating"].split(":")[0])
-                    ]
+                ].copy()
 
-            if surveyed_similar.empty:
+            if surveyed.empty:
+                blah3
 
-                # We get an average based on the postcode
-                surveyed_similar = survey_results_with_original_features[
-                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    ))
-                    ]
-                if surveyed_similar.empty:
-                    final_missed_matches.append(
-                        {
-                            "Address ID": a_id,
-                            "Confidence Tier": "4 - no similar property, needs survey to confirm",
-                            "Current EPC Band": "Unknown"
-                        }
+            # Calculate distance
+            surveyed["distance_meters"] = haversine(
+                lat1=property["latitude"], lon1=property["longitude"],
+                lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values
+            )
+            surveyed = surveyed.sort_values("distance_meters", ascending=True)
 
-                    )
-                else:
-                    expected_sap = surveyed_similar["Current SAP Rating"].mean()
-                    expected_epc = sap_to_epc(expected_sap)
-                    if expected_epc in ["C", "B", "A"]:
-                        tier = "5 - EPC C or above"
-                    else:
-                        tier = "3 - similar property, relaxed conditions"
+            # Check if we have a postcode match, i.e. a surveyed property in the same postcode as this property
+            if any(surveyed["Postcode"] == property["Postcode"]):
+                surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]]
 
-                    final_missed_matches.append(
-                        {
-                            "Address ID": a_id,
-                            "Confidence Tier": tier,
-                            "Current EPC Band": expected_epc
-                        }
-                    )
-                continue
-            # We take an average
-            expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            if any(surveyed["Postal Region"] == property["Postal Region"]):
+                surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
+
+            # Take the 5 nearest. NOTE(review): a postal-region match above replaces any postcode match - intended?
+            surveyed_similar = surveyed_similar.head(5)
+
+            # Distance-weighted mean of SAP rating: weight = 1 / (distance + 1), so closer properties count more
+            expected_sap = np.average(
+                surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1)
+            )
             expected_epc = sap_to_epc(expected_sap)
+
             if expected_epc in ["C", "B", "A"]:
                 tier = "5 - EPC C or above"
             else:
-                tier = "3 - similar property"
+                tier = "3 - similar property, weighted on distance"
 
             final_missed_matches.append(
                 {
@@ -1891,6 +1934,121 @@ def propsed_wave_3_sample():
                     "Current EPC Band": expected_epc
                 }
             )
+            continue
+
+            # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
+            #     filter_property_types = ["House", "Bungalow"]
+            # else:
+            #     filter_property_types = ["Flat"]
+            #
+            # surveyed_similar = survey_results_with_original_features[
+            #     (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+            #     (
+            #         survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         )
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #         property["Wall Type"].split(":")[0]
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #         property["Roof Type"].split(":")[0]
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #         property["Heating"].split(":")[0]
+            #     )
+            #     ]
+            # if surveyed_similar.empty:
+            #     surveyed_similar = survey_results_with_original_features[
+            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         )) &
+            #         (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #          property["Wall Type"].split(":")[0]) &
+            #         (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #          property["Roof Type"].split(":")[0]) &
+            #         (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #          property["Heating"].split(":")[0])
+            #         ]
+            #
+            # if surveyed_similar.empty:
+            #
+            #     # We get an average based on the postcode
+            #     surveyed_similar = survey_results_with_original_features[
+            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         ))
+            #         ]
+            #     if surveyed_similar.empty:
+            #         surveyed_similar_entire_population = survey_results_with_original_features[
+            #             (
+            #                 survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
+            #                 "Property Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #                 property["Wall Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #                 property["Roof Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #                 property["Heating"].split(":")[0]
+            #             )
+            #             ]
+            #
+            #         # We order them by distance on postcode
+            #
+            #         # Average
+            #         expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
+            #         expected_epc = sap_to_epc(expected_sap)
+            #
+            #         final_missed_matches.append(
+            #             {
+            #                 "Address ID": a_id,
+            #                 "Confidence Tier": "3 - similar property, all areas searched",
+            #                 "Current EPC Band": expected_epc
+            #             }
+            #
+            #         )
+            #     else:
+            #         expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            #         expected_epc = sap_to_epc(expected_sap)
+            #         if expected_epc in ["C", "B", "A"]:
+            #             tier = "5 - EPC C or above"
+            #         else:
+            #             tier = "3 - similar property, relaxed conditions"
+            #
+            #         final_missed_matches.append(
+            #             {
+            #                 "Address ID": a_id,
+            #                 "Confidence Tier": tier,
+            #                 "Current EPC Band": expected_epc
+            #             }
+            #         )
+            #     continue
+            # # We take an average
+            # expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            # expected_epc = sap_to_epc(expected_sap)
+            # if expected_epc in ["C", "B", "A"]:
+            #     tier = "5 - EPC C or above"
+            # else:
+            #     tier = "3 - similar property"
+            #
+            # final_missed_matches.append(
+            #     {
+            #         "Address ID": a_id,
+            #         "Confidence Tier": tier,
+            #         "Current EPC Band": expected_epc
+            #     }
+            # )
 
         final_missed_matches = pd.DataFrame(final_missed_matches)
 
@@ -1928,27 +2086,33 @@ def propsed_wave_3_sample():
 
     # We create the gain and loss columns
     # Gain is the sum of these columns:
-    # '1 - Archetype surveyed', '1 - property was surveyed',
-    #        '2 - same archetype', '3 - similar property',
+    # '1 - Archetype surveyed',
+    # '1 - property was surveyed',
+    # '2 - same archetype',
+    # '3 - similar property',
+    # '3 - similar property, all areas searched',
+    # '3 - similar property, relaxed conditions'
+    #
     # Loss is the sum of these columns:
     # '4 - no similar property, needs survey to confirm',
     # '5 - EPC C or above', '5 - property was surveyed'
     geographic_summary["Gain"] = geographic_summary[
-        ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property']
+        [
+            '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property',
+            '3 - similar property, all areas searched', '3 - similar property, relaxed conditions'
+        ]
     ].sum(axis=1)
 
     geographic_summary["Loss"] = geographic_summary[
-        ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed']
+        ['5 - EPC C or above', '5 - property was surveyed']
     ].sum(axis=1)
 
-    geographic_summary.sum()
+    print(geographic_summary.sum())
 
     geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
     geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
     geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
 
-    geographic_summary[["Loss", "Gain"]].head()
-
     loss = geographic_summary["Loss"].values
     gain = geographic_summary["Gain"].values