messing around with street match

2026-07-27 23:35:01 +00:00 · 2024-11-17 22:33:42 +00:00 · 2024-11-17 22:33:42 +00:00 · 67f97feb18
commit 67f97feb18
parent 1b38832e27
1 changed files with 74 additions and 31 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@@ -1637,7 +1637,7 @@ def propsed_wave_3_sample():

    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
    # UPRN
-    asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
+    asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])]
    # Clean address ids
    asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
    asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@@ -1645,12 +1645,13 @@ def propsed_wave_3_sample():

    # Create the postal region, taking the first part of the postcode
    asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
+    asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"]
    unique_postal_regions = asset_list["Postal Region"].unique()

    # Keep just the columns we need
    asset_list = asset_list[
-        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
-         "Heating"]
+        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region",
+         "Property Type", "Wall Type", "Roof Type", "Heating"]
    ]

    survey_results = pd.read_excel(
@@ -1853,7 +1854,6 @@ def propsed_wave_3_sample():
                suffixes=("", "_method2")
            )
        else:
-
            region_assets = region_assets.merge(
                archetype_surveyed,
                on="Archetype ID",
@@ -1903,20 +1903,20 @@ def propsed_wave_3_sample():

            surveyed = survey_results_with_original_features[
                (
-                    survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
-                    property["Property Type"].split(":")[0]
+                    survey_results_with_original_features["Property Type"] ==
+                    property["Property Type"]
                ) &
                (
-                    survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-                    property["Wall Type"].split(":")[0]
+                    survey_results_with_original_features["Wall Type"] ==
+                    property["Wall Type"]
                ) &
                (
-                    survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-                    property["Roof Type"].split(":")[0]
+                    survey_results_with_original_features["Roof Type"] ==
+                    property["Roof Type"]
                ) &
                (
-                    survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-                    property["Heating"].split(":")[0]
+                    survey_results_with_original_features["Heating"] ==
+                    property["Heating"]
                )
                ].copy()

@@ -1962,7 +1962,10 @@ def propsed_wave_3_sample():
                if "Electric" in property["Heating"]:
                    # Take other electric heating systems
                    surveyed = surveyed[surveyed["Heating"].str.contains("Electric")]
-                elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)":
+                elif property["Heating"] in [
+                    "Community Heating Systems: Community boilers only (RdSAP)",
+                    "Community Heating Systems: Community CHP and boilers (RdSAP)"
+                ]:
                    # Take other community heating systems
                    surveyed = surveyed[surveyed["Heating"].str.contains("Community")]
                elif property["Heating"] == 'Heat Pump: (from database)':
@@ -2001,8 +2004,8 @@ def propsed_wave_3_sample():
            if any(surveyed["Postal Region"] == property["Postal Region"]):
                surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]]

-            # Take the 5 nearest
-            surveyed = surveyed.head(5)
+            # Take the 3 nearest
+            surveyed = surveyed.head(3)

            # # We allow a max distance of 10km
            # surveyed = surveyed[surveyed["distance_meters"] < 10000]
@@ -2176,6 +2179,9 @@ def propsed_wave_3_sample():

    results = pd.concat(results)

+    # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1)
+    # region = home["Postal Region"].values[0]
+
    # Create a pivot table for counts of Confidence Tier by Postal Region
    geographic_summary = results.pivot_table(
        index='Postal Region',
@@ -2192,7 +2198,9 @@ def propsed_wave_3_sample():
    # '3 - similar property, weighted on distance'

    gain_columns = [
-        '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype',
+        '1 - Archetype surveyed',
+        '1 - property was surveyed',
+        '2 - same archetype',
        '3 - similar property, weighted on distance'
    ]
    #
@@ -2200,8 +2208,11 @@ def propsed_wave_3_sample():
    # '4 - no similar property, needs survey to confirm',
    # '5 - EPC C or above', '5 - property was surveyed'

-    loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above',
-                    '5 - property was surveyed']
+    loss_columns = [
+        '4 - no similar property, needs survey to confirm',
+        '5 - EPC C or above',
+        '5 - property was surveyed'
+    ]
    geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1)
    geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1)

@@ -2249,26 +2260,30 @@ def propsed_wave_3_sample():
    # We now see if there are any postcodes that have no loss that can be added
    unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values

+    # TODO: trial grouping by street - the postcode_summary frames below are keyed on "Street and Region", not "Postcode"
+
    postcode_summary = results.pivot_table(
-        index='Postcode',
+        index='Street and Region',
        columns='Confidence Tier',
        aggfunc='size',
        fill_value=0
    ).reset_index()
-    postcode_summary = postcode_summary.merge(
-        results[["Postcode", "Postal Region"]].drop_duplicates(),
-        how="left", on="Postcode"
-    )
-
-    postcode_summary_unselected_regions = postcode_summary[
-        postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions)
-    ].copy()
+    # postcode_summary = postcode_summary.merge(
+    #     results[["Postcode", "Postal Region"]].drop_duplicates(),
+    #     how="left", on="Postcode"
+    # )
+    #
+    postcode_summary_unselected_regions = postcode_summary.copy()
+    # postcode_summary_unselected_regions = postcode_summary[
+    #     postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions)
+    # ].copy()

    postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1)
    postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1)

    # Remaining loss allowed
-    remaining_loss_constraint = 250 - region_totals["Loss"]
+    # remaining_loss_constraint = 230 - region_totals["Loss"]
+    remaining_loss_constraint = 250
    postcode_selected_rows, _ = optimise(
        gain=postcode_summary_unselected_regions["Gain"].values,
        loss=postcode_summary_unselected_regions["Loss"].values,
@@ -2284,12 +2299,40 @@ def propsed_wave_3_sample():

    postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum()

-    bid_size = region_totals.sum() + postcode_totals.sum()
+    bid_size = postcode_totals.sum()
    print("Bid Size:", bid_size)
-    total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"]
+    total_epc_d_or_below = postcode_totals["Gain"]
    print("Total EPC D or below:", total_epc_d_or_below)
-    total_epc_c = region_totals["Loss"] + postcode_totals["Loss"]
+    total_epc_c = postcode_totals["Loss"]
    print("Total EPC C or above:", total_epc_c)
+    # Total needing a survey
+    total_needing_survey = postcode_optimised_additional_properties[
+        "4 - no similar property, needs survey to confirm"
+    ].sum()
+    print("Total needing survey:", total_needing_survey)
+
+    # Within the streets not marked "Selected", look for postcodes that have no loss
+    unselected_streets = postcode_summary_unselected_regions[
+        ~postcode_summary_unselected_regions["Selected"]
+    ]["Street and Region"].values
+
+    postcode_summary2 = results[
+        results["Street and Region"].isin(unselected_streets)
+    ].pivot_table(
+        index='Postcode',
+        columns='Confidence Tier',
+        aggfunc='size',
+        fill_value=0
+    ).reset_index()
+
+    postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1)
+    postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1)
+
+    no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False)
+    total_bid_size = bid_size + no_loss_postcodes["Gain"].sum()
+    print(total_bid_size)
+
+    z = results[results["Confidence Tier"] == "5 - EPC C or above"]

 # if __name__ == "__main__":
 #     main()