implementing distance weighting

2026-07-27 23:35:01 +00:00 · 2024-11-17 19:10:23 +00:00 · 2024-11-17 19:10:23 +00:00 · eff80e637f
commit eff80e637f
parent 7d63c16404
1 changed files with 248 additions and 84 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -1635,8 +1635,9 @@ def propsed_wave_3_sample():
        header=4
    )

-    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
-    asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"]
+    # TODO: We drop 302 properties that are not priority postcodes and 7 properties that are missing a UPRN -
+    # confirm w/ Stonewater
+    asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
    # Clean address ids
    asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
    asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@ -1648,7 +1649,7 @@ def propsed_wave_3_sample():

    # Keep just the columns we need
    asset_list = asset_list[
-        ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
+        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
         "Heating"]
    ]

@ -1665,7 +1666,7 @@ def propsed_wave_3_sample():
    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]

    survey_results_with_original_features = survey_results.merge(
-        asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+        asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
        on="Address ID",
        how="left"
    )
@ -1673,6 +1674,45 @@ def propsed_wave_3_sample():
    if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
        raise ValueError("Something went wrong")

+    # Look up the latitude and longitude for each UPRN
+    from utils.s3 import read_pickle_from_s3
+    archetyping_spatial_features = read_pickle_from_s3(
+        bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+    )
+    archetyping_spatial_features = pd.concat(archetyping_spatial_features)
+    archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename(
+        columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"}
+    )
+    # Merge them onto both datasets
+    asset_list = asset_list.merge(
+        archetyping_spatial_features, how="left", on="UPRN"
+    )
+    if pd.isnull(asset_list["longitude"]).sum():
+        raise ValueError("Something went wrong")
+
+    survey_results_with_original_features = survey_results_with_original_features.merge(
+        archetyping_spatial_features, how="left", on="UPRN"
+    )
+    if pd.isnull(survey_results_with_original_features["longitude"]).sum():
+        raise ValueError("Something went wrong")
+
+    def haversine(lat1, lon1, lat2, lon2):  # great-circle distance in metres; inputs are in degrees
+        # Earth radius in metres (6371 km); this sets the unit of the returned distance
+        R = 6371000
+
+        # Convert degrees to radians (np.radians works elementwise, so scalars and arrays can be mixed)
+        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
+
+        # Latitude / longitude differences, in radians
+        dlat = lat2 - lat1
+        dlon = lon2 - lon1
+
+        # Haversine formula: c is the central angle in radians between the two points
+        a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
+        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
+        distance = R * c
+        return distance
+
    # Tier definitions
    # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D
    # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
@ -1716,6 +1756,7 @@ def propsed_wave_3_sample():
            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()

        if region_surveyed["Archetype ID"].duplicated().sum():
+            blah1
            region_surveyed = survey_results[
                survey_results["Archetype ID"].isin(archetypes) &
                (survey_results["Postal Region"] == region)
@ -1755,23 +1796,46 @@ def propsed_wave_3_sample():
            survey_results["Archetype ID"].isin(missed_archetypes)
        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()

-        # TODO - We could average the property?? And call it borderline, call out it was averaged!!!
-        #        We could also find the nearest property to it, with similar wall, roof, heating?
-        #        Can use long/lag to distance calc. We have this data from previous
-
        if archetype_surveyed["Archetype ID"].duplicated().sum():
-            archetype_surveyed = survey_results[
-                survey_results["Archetype ID"].isin(missed_archetypes)
-            ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
-            archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
-            archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])

-        region_assets = region_assets.merge(
-            archetype_surveyed,
-            on="Archetype ID",
-            how="left",
-            suffixes=("", "_method2")
-        )
+            archetype_surveyed = []
+            for arch_id in missed_archetypes:
+                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                    archetype_data = survey_results_with_original_features[
+                        survey_results["Archetype ID"] == arch_id
+                        ].copy()
+                    if archetype_data.empty:
+                        continue
+                    archetype_data["distance_meters"] = haversine(
+                        lat1=property.latitude, lon1=property.longitude,
+                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                    )
+                    expected_sap = np.average(
+                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                    )
+                    expected_epc = sap_to_epc(expected_sap)
+                    archetype_surveyed.append(
+                        {
+                            "Archetype ID": arch_id,
+                            "Address ID": property["Address ID"],
+                            "Current EPC Band": expected_epc
+                        }
+                    )
+            archetype_surveyed = pd.DataFrame(archetype_surveyed)
+            region_assets = region_assets.merge(
+                archetype_surveyed,
+                on=["Archetype ID", "Address ID"],
+                how="left",
+                suffixes=("", "_method2")
+            )
+        else:
+
+            region_assets = region_assets.merge(
+                archetype_surveyed,
+                on="Archetype ID",
+                how="left",
+                suffixes=("", "_method2")
+            )

        region_assets["Confidence Tier"] = np.where(
            region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
@ -1792,6 +1856,16 @@ def propsed_wave_3_sample():
            "5 - EPC C or above", region_assets["Confidence Tier"]
        )

+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Archetype ID"] == "EPC C OR ABOVE",
+            "5 - EPC C or above", region_assets["Confidence Tier"]
+        )
+
+        region_assets["Current EPC Band"] = np.where(
+            region_assets["Archetype ID"] == "EPC C OR ABOVE",
+            "C", region_assets["Current EPC Band"]
+        )
+
        missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()

        if not missed_addressids:
@ -1803,17 +1877,10 @@ def propsed_wave_3_sample():
        for a_id in missed_addressids:
            property = asset_list[asset_list["Address ID"] == a_id].squeeze()

-            if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
-                filter_property_types = ["House", "Bungalow"]
-            else:
-                filter_property_types = ["Flat"]
-
-            surveyed_similar = survey_results_with_original_features[
-                (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+            surveyed = survey_results_with_original_features[
                (
-                    survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    )
+                    survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+                    property["Property Type"].split(":")[0]
                ) &
                (
                    survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
@ -1827,62 +1894,38 @@ def propsed_wave_3_sample():
                    survey_results_with_original_features["Heating"].str.split(":").str[0] ==
                    property["Heating"].split(":")[0]
                )
-                ]
-            if surveyed_similar.empty:
-                surveyed_similar = survey_results_with_original_features[
-                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    )) &
-                    (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-                     property["Wall Type"].split(":")[0]) &
-                    (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-                     property["Roof Type"].split(":")[0]) &
-                    (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-                     property["Heating"].split(":")[0])
-                    ]
+                ].copy()

-            if surveyed_similar.empty:
+            if surveyed.empty:
+                blah3

-                # We get an average based on the postcode
-                surveyed_similar = survey_results_with_original_features[
-                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    ))
-                    ]
-                if surveyed_similar.empty:
-                    final_missed_matches.append(
-                        {
-                            "Address ID": a_id,
-                            "Confidence Tier": "4 - no similar property, needs survey to confirm",
-                            "Current EPC Band": "Unknown"
-                        }
+            # Calculate distance
+            surveyed["distance_meters"] = haversine(
+                lat1=property["latitude"], lon1=property["longitude"],
+                lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values
+            )
+            surveyed = surveyed.sort_values("distance_meters", ascending=True)

-                    )
-                else:
-                    expected_sap = surveyed_similar["Current SAP Rating"].mean()
-                    expected_epc = sap_to_epc(expected_sap)
-                    if expected_epc in ["C", "B", "A"]:
-                        tier = "5 - EPC C or above"
-                    else:
-                        tier = "3 - similar property, relaxed conditions"
+            # Prefer surveyed properties in the same postcode. NOTE(review): the postal-region check below reassigns this
+            if any(surveyed["Postcode"] == property["Postcode"]):
+                surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]]

-                    final_missed_matches.append(
-                        {
-                            "Address ID": a_id,
-                            "Confidence Tier": tier,
-                            "Current EPC Band": expected_epc
-                        }
-                    )
-                continue
-            # We take an average
-            expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            if any(surveyed["Postal Region"] == property["Postal Region"]):
+                surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
+
+            # Take the 5 nearest
+            surveyed_similar = surveyed_similar.head(5)
+
+            # Distance-weighted mean of SAP rating: weight = 1 / (distance_meters + 1), so nearer surveys count more
+            expected_sap = np.average(
+                surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1)
+            )
            expected_epc = sap_to_epc(expected_sap)
+
            if expected_epc in ["C", "B", "A"]:
                tier = "5 - EPC C or above"
            else:
-                tier = "3 - similar property"
+                tier = "3 - similar property, weighted on distance"

            final_missed_matches.append(
                {
@ -1891,6 +1934,121 @@ def propsed_wave_3_sample():
                    "Current EPC Band": expected_epc
                }
            )
+            continue
+
+            # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
+            #     filter_property_types = ["House", "Bungalow"]
+            # else:
+            #     filter_property_types = ["Flat"]
+            #
+            # surveyed_similar = survey_results_with_original_features[
+            #     (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+            #     (
+            #         survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         )
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #         property["Wall Type"].split(":")[0]
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #         property["Roof Type"].split(":")[0]
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #         property["Heating"].split(":")[0]
+            #     )
+            #     ]
+            # if surveyed_similar.empty:
+            #     surveyed_similar = survey_results_with_original_features[
+            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         )) &
+            #         (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #          property["Wall Type"].split(":")[0]) &
+            #         (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #          property["Roof Type"].split(":")[0]) &
+            #         (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #          property["Heating"].split(":")[0])
+            #         ]
+            #
+            # if surveyed_similar.empty:
+            #
+            #     # We get an average based on the postcode
+            #     surveyed_similar = survey_results_with_original_features[
+            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         ))
+            #         ]
+            #     if surveyed_similar.empty:
+            #         surveyed_similar_entire_population = survey_results_with_original_features[
+            #             (
+            #                 survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
+            #                 "Property Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #                 property["Wall Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #                 property["Roof Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #                 property["Heating"].split(":")[0]
+            #             )
+            #             ]
+            #
+            #         # We order them by distance on postcode
+            #
+            #         # Average
+            #         expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
+            #         expected_epc = sap_to_epc(expected_sap)
+            #
+            #         final_missed_matches.append(
+            #             {
+            #                 "Address ID": a_id,
+            #                 "Confidence Tier": "3 - similar property, all areas searched",
+            #                 "Current EPC Band": expected_epc
+            #             }
+            #
+            #         )
+            #     else:
+            #         expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            #         expected_epc = sap_to_epc(expected_sap)
+            #         if expected_epc in ["C", "B", "A"]:
+            #             tier = "5 - EPC C or above"
+            #         else:
+            #             tier = "3 - similar property, relaxed conditions"
+            #
+            #         final_missed_matches.append(
+            #             {
+            #                 "Address ID": a_id,
+            #                 "Confidence Tier": tier,
+            #                 "Current EPC Band": expected_epc
+            #             }
+            #         )
+            #     continue
+            # # We take an average
+            # expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            # expected_epc = sap_to_epc(expected_sap)
+            # if expected_epc in ["C", "B", "A"]:
+            #     tier = "5 - EPC C or above"
+            # else:
+            #     tier = "3 - similar property"
+            #
+            # final_missed_matches.append(
+            #     {
+            #         "Address ID": a_id,
+            #         "Confidence Tier": tier,
+            #         "Current EPC Band": expected_epc
+            #     }
+            # )

        final_missed_matches = pd.DataFrame(final_missed_matches)

@ -1928,27 +2086,33 @@ def propsed_wave_3_sample():

    # We create the gain and loss columns
    # Gain is the sum of these columns:
-    # '1 - Archetype surveyed', '1 - property was surveyed',
-    #        '2 - same archetype', '3 - similar property',
+    # '1 - Archetype surveyed',
+    # '1 - property was surveyed',
+    # '2 - same archetype',
+    # '3 - similar property',
+    # '3 - similar property, all areas searched',
+    # '3 - similar property, relaxed conditions'
+    #
    # Loss is the sum of these columns:
    # '4 - no similar property, needs survey to confirm',
    # '5 - EPC C or above', '5 - property was surveyed'
    geographic_summary["Gain"] = geographic_summary[
-        ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property']
+        [
+            '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property',
+            '3 - similar property, all areas searched', '3 - similar property, relaxed conditions'
+        ]
    ].sum(axis=1)

    geographic_summary["Loss"] = geographic_summary[
-        ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed']
+        ['5 - EPC C or above', '5 - property was surveyed']
    ].sum(axis=1)

-    geographic_summary.sum()
+    print(geographic_summary.sum())

    geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
    geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
    geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()

-    geographic_summary[["Loss", "Gain"]].head()
-
    loss = geographic_summary["Loss"].values
    gain = geographic_summary["Gain"].values