From 10bc433283417a2c15ffe2924537ded81af240d6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 3 Feb 2025 16:06:47 +0000
Subject: [PATCH] assigning properties to bands

---
 .../stonewater/Wave 3 Preparation.py          | 71 ++++++++++++++++---
 1 file changed, 62 insertions(+), 9 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 04078e47..c623e9f7 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3212,10 +3212,10 @@ def revised_model():
         "12  Ashcroft Close": 26399
     }
     for name, asset_id in missing_lookup.items():
-        wates_coordination["Asset ID_x"] = np.where(
+        wates_coordination["Asset ID"] = np.where(
             wates_coordination["Name"] == name,
             asset_id,
-            wates_coordination["Asset ID_x"]
+            wates_coordination["Asset ID"]
         )
 
     wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])]
@@ -3596,6 +3596,16 @@ def revised_model():
         matching_lookup, how="left", on="Name"
     )
 
+    # 4 properties on the Wates coordination board are already on the retrofit packages board; we drop them from Wates
+    to_remove = wates_coordination[
+        wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+    ]
+    assert to_remove.shape[0] == 4
+    # Remove them from the wates board
+    wates_coordination = wates_coordination[
+        ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+    ]
+
     # We combine this into a singular board
     coordinated_packages = pd.concat(
         [
@@ -3662,6 +3672,7 @@ def revised_model():
     )
 
     coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int)
+    assert not coordinated_packages["Organisation Reference"].duplicated().sum()
 
     # Merge the property features on
     coordinated_packages = coordinated_packages.merge(
@@ -3670,6 +3681,25 @@ def revised_model():
         on="Organisation Reference"
     )
 
+    # We match the properties to their closest match
+    # We clean up the SAP ratings in the coordinated packages
+    def sap_to_number(x):
+        """Parse a SAP score (72, "72", "72C" or "C72") into an int; returns None if the format is unrecognised."""
+        try:
+            return int(x)
+        except (TypeError, ValueError):
+            if x[-1] in ["A", "B", "C", "D", "E", "F", "G"]:
+                return int(x[:-1])
+            if x[0] in ["A", "B", "C", "D", "E", "F", "G"]:
+                return int(x[1:])
+
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])]
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])]
+
+    coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply(
+        lambda x: sap_to_number(x)
+    )
+
     # We need the features pertaining to these priority postcodes
 
     def find_nearest_matching_property(coordinated_packages, home):
@@ -3729,11 +3759,9 @@ def revised_model():
     no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
 
     # len(no_match)
-    # 8764, 5607
+    # 8764, 5607, 5646
     # no_match_summary.shape
-    # (3953, 6), (2948, 6)
-
-    # We match the properties to their closest match
+    # (3953, 6), (2948, 6), (2969, 7)
 
     matches_df = pd.DataFrame(matches)
     matches_df = matches_df.merge(
@@ -3745,11 +3773,36 @@ def revised_model():
     aggregated_matches_df = []
     for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
         if mapped_matches.shape[0] == 1:
-            mapped_matches["Number of matches"] = 1
-            mapped_matches["Proportion"]
-            aggregated_matches_df.append(mapped_matches)
+            aggregated_matches_df.append(
+                {
+                    "Organisation Reference": org_ref,
+                    "Number of matches": 1,
+                    "Proportion": 100,
+                    "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0],
+                    "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0])
+                }
+            )
             continue
 
+        # We need to aggregate the matches, since we have multiple
+        average_rating = mapped_matches["Actual SAP Rating"].mean()
+        number_of_matches = mapped_matches.shape[0]
+        average_epc_rating = sap_to_epc(average_rating)
+        # proportion is the percentage of matches whose actual SAP band equals the estimated EPC rating
+        proportion_with_this_epc = int(
+            mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100)
+        aggregated_matches_df.append(
+            {
+                "Organisation Reference": org_ref,
+                "Number of matches": number_of_matches,
+                "Proportion": proportion_with_this_epc,
+                "Estimated SAP Rating": average_rating,
+                "Estimated EPC Rating": average_epc_rating
+            }
+        )
+
+    aggregated_matches_df = pd.DataFrame(aggregated_matches_df)
+
     mapped_priority_list = new_priority_postcodes.merge(
         matches_df, on="Organisation Reference",
     )