debugging stonewater algorithm

2026-07-27 23:35:01 +00:00 · 2024-11-17 15:16:54 +00:00 · 2024-11-17 15:16:54 +00:00 · d00c291c17
commit d00c291c17
parent 4d021f0ba6
1 changed files with 25 additions and 43 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -1716,20 +1716,11 @@ def propsed_wave_3_sample():
            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()

        if region_surveyed["Archetype ID"].duplicated().sum():
-            # Take the duplicated archetypes
-            duplicated_archetypes = region_surveyed[
-                region_surveyed["Archetype ID"].duplicated()
-            ]["Archetype ID"].unique()
-            duplicated_archetypes = region_surveyed[
-                region_surveyed["Archetype ID"].isin(duplicated_archetypes)
-            ]
-
-            # We need to select which one is the most relevant to these properties
-            survey_data = survey_results_with_original_features[
-                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values)
-            ]
-
-            raise NotImplementedError("Fix me")
+            region_surveyed = survey_results[
+                survey_results["Archetype ID"].isin(archetypes) &
+                (survey_results["Postal Region"] == region)
+                ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
+            region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc)

        region_assets = region_assets.merge(
            region_surveyed,
@ -1744,6 +1735,17 @@ def propsed_wave_3_sample():
            pd.isnull(region_assets["Confidence Tier"]),
            "1 - Archetype surveyed", region_assets["Confidence Tier"]
        )
+
+        region_assets["Current EPC Band"] = np.where(
+            pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]),
+            region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"]
+        )
+        # Handle EPC C or above (bands C, B and A) that have no confidence tier yet
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
+            "6 - EPC C or above", region_assets["Confidence Tier"]
+        )
+
        region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
        # TODO: Turn into a function
        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
@ -1752,36 +1754,16 @@ def propsed_wave_3_sample():
            survey_results["Archetype ID"].isin(missed_archetypes)
        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()

+        # TODO - We could average the surveyed properties(?), label the result as borderline, and call out that it was averaged.
+        #        We could also find the nearest surveyed property with a similar wall, roof and heating type.
+        #        Long/lat can be used for the distance calculation; we already have this data from previous work.
+
        if archetype_surveyed["Archetype ID"].duplicated().sum():
-            # We need to select which one is the most relevant to these properties
-            duplicated_archetypes = archetype_surveyed[
-                archetype_surveyed["Archetype ID"].duplicated()
-            ]["Archetype ID"].unique()
-
-            survey_data = survey_results_with_original_features[
-                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes)
-            ]
-
-            homes_with_these_archetypes = region_assets[
-                region_assets["Archetype ID"].isin(duplicated_archetypes)
-            ]
-
-            for _, home in homes_with_these_archetypes.iterrows():
-                first_filter = survey_data[
-                    (survey_data["Postal Region"] == home["Postal Region"]) &
-                    (survey_data["Property Type"] == home["Property Type"]) &
-                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
-                    ]
-
-                if not first_filter.empty:
-                    NotImplementedError("Fix me 0")
-
-                second_filter = survey_data[
-                    (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) &
-                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
-                    ]
-
-            raise NotImplementedError("Fix me 2")
+            archetype_surveyed = survey_results[
+                survey_results["Archetype ID"].isin(missed_archetypes)
+            ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
+            archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
+            archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])

        region_assets = region_assets.merge(
            archetype_surveyed,