From 4d021f0ba6a5894659275d8090e1f65be6ca68f6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 Nov 2024 17:12:55 +0000
Subject: [PATCH] working on stonewater alg

---
 .../stonewater/Wave 3 Preparation.py          | 102 +++++++++++++++---
 1 file changed, 86 insertions(+), 16 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ef7dd414..40dfd38e 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3,6 +3,7 @@ import PyPDF2
 import re
 import pandas as pd
 import numpy as np
+from docutils.utils.math.tex2mathml_extern import blahtexml
 from tqdm import tqdm
 from collections import Counter
 
@@ -1681,19 +1682,15 @@ def propsed_wave_3_sample():
     for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
         region_assets = asset_list[asset_list["Postal Region"] == region].copy()
-        archetypes = region_assets["Archetype ID"].unique()
-        # We get the properties that have been surveyed
-        region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(archetypes) &
-            (survey_results["Postal Region"] == region)
-            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
-        if region_surveyed["Archetype ID"].duplicated().sum():
-            raise NotImplementedError("Fix me")
+        # We have a tier 1 match if the property itself was surveyed
+        exact_surveyed = survey_results[
+            survey_results["Address ID"].isin(region_assets["Address ID"])
+        ]
 
         region_assets = region_assets.merge(
-            region_surveyed,
-            on="Archetype ID",
+            exact_surveyed[["Address ID", "Current EPC Band"]],
+            on="Address ID",
             how="left"
         )
 
@@ -1701,22 +1698,95 @@ def propsed_wave_3_sample():
         region_assets["Confidence Tier"] = None
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]),
-            "1", region_assets["Confidence Tier"]
+            "1 - property was surveyed", region_assets["Confidence Tier"]
         )
-        # TODO: Turn into a function
-        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
 
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["C", "B", "A"]),
+            "6 - property was surveyed", region_assets["Confidence Tier"]
+        )
+
+        archetypes = region_assets[
+            pd.isnull(region_assets["Confidence Tier"])
+        ]["Archetype ID"].unique()
+        # We get the properties that have been surveyed
         region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(missed_archetypes)
-        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+            survey_results["Archetype ID"].isin(archetypes) &
+            (survey_results["Postal Region"] == region)
+            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
-            raise NotImplementedError("Fix me 2")
+            # Take the duplicated archetypes
+            duplicated_archetypes = region_surveyed[
+                region_surveyed["Archetype ID"].duplicated()
+            ]["Archetype ID"].unique()
+            duplicated_archetypes = region_surveyed[
+                region_surveyed["Archetype ID"].isin(duplicated_archetypes)
+            ]
+
+            # We need to select which one is the most relevant to these properties
+            survey_data = survey_results_with_original_features[
+                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values)
+            ]
+
+            raise NotImplementedError("Fix me")
 
         region_assets = region_assets.merge(
             region_surveyed,
             on="Archetype ID",
             how="left",
+            suffixes=("", "_method1")
+        )
+
+        # Label the tier 1 properties
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
+            pd.isnull(region_assets["Confidence Tier"]),
+            "1 - Archetype surveyed", region_assets["Confidence Tier"]
+        )
+        region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
+        # TODO: Turn into a function
+        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
+
+        archetype_surveyed = survey_results[
+            survey_results["Archetype ID"].isin(missed_archetypes)
+        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        if archetype_surveyed["Archetype ID"].duplicated().sum():
+            # We need to select which one is the most relevant to these properties
+            duplicated_archetypes = archetype_surveyed[
+                archetype_surveyed["Archetype ID"].duplicated()
+            ]["Archetype ID"].unique()
+
+            survey_data = survey_results_with_original_features[
+                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes)
+            ]
+
+            homes_with_these_archetypes = region_assets[
+                region_assets["Archetype ID"].isin(duplicated_archetypes)
+            ]
+
+            for _, home in homes_with_these_archetypes.iterrows():
+                first_filter = survey_data[
+                    (survey_data["Postal Region"] == home["Postal Region"]) &
+                    (survey_data["Property Type"] == home["Property Type"]) &
+                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
+                    ]
+
+                if not first_filter.empty:  # a survey matches this home's region, property type and wall-type prefix
+                    raise NotImplementedError("Fix me 0")  # must be raised: a bare exception expression is a no-op
+
+                second_filter = survey_data[
+                    (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) &
+                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
+                    ]
+
+            raise NotImplementedError("Fix me 2")
+
+        region_assets = region_assets.merge(
+            archetype_surveyed,
+            on="Archetype ID",
+            how="left",
             suffixes=("", "_method2")
         )