From 0796b384fb3aa8bf6cb3689c21cd5c5ac5acfc87 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 15 Nov 2024 18:42:08 +0000
Subject: [PATCH 01/31] added non-invasive rec

---
 .idea/Model.iml                         |  2 +-
 .idea/misc.xml                          |  2 +-
 etl/customers/remote_assessments/app.py | 27 +++++++++++++++++--------
 recommendations/FloorRecommendations.py | 11 +++++++++-
 4 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 762580d9..df6c4faa 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index c916a158..50cad4ca 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index a0d01f7d..33015d87 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from utils.s3 import save_csv_to_s3
 
-PORTFOLIO_ID = 111
+PORTFOLIO_ID = 120
 USER_ID = 8
 
 
@@ -13,9 +13,9 @@ def app():
 
     asset_list = [
         {
-            "uprn": 100050770761,
-            "address": "12 Sheardown Street",
-            "postcode": "DN4 0BH"
+            "uprn": 100030334057,
+            "address": "5, Lynton Street",
+            "postcode": "DE22 3RW"
         }
     ]
     asset_list = pd.DataFrame(asset_list)
@@ -30,11 +30,22 @@ def app():
 
     non_invasive_recommendations = [
         {
-            "uprn": 100050770761,
+            "uprn": 100030334057,
             "recommendations": [
                 {
-                    "type": "extension_cavity_wall_insulation",
+                    "type": "internal_wall_insulation",
+                    "sap_points": 9,
+                    "survey": True
+                },
+                {
+                    "type": "external_wall_insulation",
+                    "sap_points": 9,
+                    "survey": True
+                },
+                {
+                    "type": "suspended_floor_insulation",
                     "sap_points": 2,
+                    "survey": True
                 }
             ]
         }
@@ -49,8 +60,8 @@ def app():
 
     valuation_data = [
         {
-            "uprn": 100050770761,
-            "value": 67_000
+            "uprn": 100030334057,
+            "value": 133_000
         }
     ]
     # Store valuation data to s3
diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py
index 25741e7a..ed00bbe9 100644
--- a/recommendations/FloorRecommendations.py
+++ b/recommendations/FloorRecommendations.py
@@ -172,6 +172,11 @@ class FloorRecommendations(Definitions):
 
         insulation_materials = pd.DataFrame(insulation_materials)
 
+        non_invasive_recs = next(
+            (r for r in self.property.non_invasive_recommendations if
+             r["type"] == insulation_materials["type"].values[0]), {}
+        )
+
         lowest_selected_u_value = None
         for _, insulation_material_group in insulation_materials.groupby("description"):
 
@@ -217,6 +222,9 @@ class FloorRecommendations(Definitions):
                     else:
                         raise NotImplementedError("Implement me!")
 
+                    sap_points = non_invasive_recs.get("sap_points", None)
+                    survey = non_invasive_recs.get("survey", False)
+
                     floor_ending_config = FloorAttributes(new_description).process()
                     floor_simulation_config = check_simulation_difference(
                         new_config=floor_ending_config, old_config=self.property.floor, prefix="floor_"
@@ -245,7 +253,8 @@ class FloorRecommendations(Definitions):
                             "description": self._make_floor_description(material),
                             "starting_u_value": u_value,
                             "new_u_value": new_u_value,
-                            "sap_points": None,
+                            "sap_points": sap_points,
+                            "survey": survey,
                             "already_installed": already_installed,
                             "simulation_config": simulation_config,
                             "description_simulation": {

From 2b22a6012fc11b9e94cd430d0b4ae8426293ef9e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 15 Nov 2024 21:17:37 +0000
Subject: [PATCH 02/31] remote assessment complete

---
 recommendations/HotwaterRecommendations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py
index aed1a5e5..b86329e4 100644
--- a/recommendations/HotwaterRecommendations.py
+++ b/recommendations/HotwaterRecommendations.py
@@ -66,7 +66,7 @@ class HotwaterRecommendations:
             (self.property.hotwater["heater_type"] in ["electric immersion"]) &
             (self.property.data["hot-water-energy-eff"] == "Very Poor") &
             (self.property.hotwater["no_system_present"] is None) &
-            len(has_tank_recommendation) == 0
+            (len(has_tank_recommendation) == 0)
         ):
             self.recommend_tank_insulation(phase=phase)
             return

From 31c5935577d6723360841f3ddb2803f82a6b6123 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 15 Nov 2024 21:58:51 +0000
Subject: [PATCH 03/31] creating route march planning app

---
 .idea/Model.iml                            |   2 +-
 .idea/misc.xml                             |   2 +-
 etl/find_my_epc/RetrieveFindMyEpc.py       |  25 +-
 etl/route_march_data_pull/app.py           | 300 +++++++++++++++++++++
 etl/route_march_data_pull/requirements.txt |   0
 5 files changed, 326 insertions(+), 3 deletions(-)
 create mode 100644 etl/route_march_data_pull/app.py
 create mode 100644 etl/route_march_data_pull/requirements.txt

diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index cd76dae4..913a04b8 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -26,6 +26,20 @@ class RetrieveFindMyEpc:
 
         self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()
 
+    @staticmethod
+    def extract_low_carbon_sources(soup):
+        # Find the section header
+        section_header = soup.find("h3", string="Low and zero carbon energy sources")
+        if not section_header:
+            return {}
+
+        # Locate the list following the header
+        energy_list = section_header.find_next("ul")
+
+        # Extract the list items
+        sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")}
+        return sources
+
     def retrieve_newest_find_my_epc_data(self, sap_2012_date=None):
         """
         For a post code and address, we pull out all the required data from the find my epc website
@@ -191,6 +205,9 @@ class RetrieveFindMyEpc:
         # Finally, we format the recommendations
         recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)
 
+        # 4) Low and zero carbon energy sources
+        low_carbon_energy_sources = self.extract_low_carbon_sources(address_res)
+
         resulting_data = {
             'epc_certificate': epc_certificate,
             'current_epc_rating': current_rating.split(' ')[-6],
@@ -200,7 +217,8 @@ class RetrieveFindMyEpc:
             "heating_text": heating_text,
             "hot_water_text": hot_water_text,
             "recommendations": recommendations,
-            **assessment_data
+            **assessment_data,
+            **low_carbon_energy_sources
         }
 
         return resulting_data
@@ -246,6 +264,11 @@ class RetrieveFindMyEpc:
             ],
             "Band A condensing boiler": ["boiler_upgrade"],
             "Double glazing": ["double_glazing"],
+            "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"],
+            "Wind turbine": ["wind_turbine"],
+            "Loft insulation": ["loft_insulation"],
+            "Solar photovoltaic (PV) panels": ["solar_pv"],
+            "Party wall insulation": ["party_wall_insulation"],
         }
 
         survey = True
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
new file mode 100644
index 00000000..060897f8
--- /dev/null
+++ b/etl/route_march_data_pull/app.py
@@ -0,0 +1,300 @@
+import os
+import time
+
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+    estimate_perimeter,
+    estimate_external_wall_area,
+    estimate_number_of_floors
+)
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        postcode = home[postcode_column]
+        house_number = home[address1_column]
+        full_address = home[fulladdress_column]
+
+        searcher = SearchEpc(
+            address1=str(house_number),
+            postcode=postcode,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            property_type=None,
+            fast=True,
+            full_address=full_address,
+            max_retries=5
+        )
+        # Force the skipping of estimating the EPC
+        searcher.ordnance_survey_client.property_type = None
+        searcher.ordnance_survey_client.built_form = None
+
+        searcher.find_property(skip_os=True)
+        if searcher.newest_epc is None:
+            continue
+
+        # Look for EPC recommendations
+        try:
+            property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+        except:
+            property_recommendations = {"rows": []}
+
+        # Retrieve data from FindMyEPC
+        find_epc_searcher = RetrieveFindMyEpc(
+            address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
+        )
+        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+        time.sleep(np.random.uniform(0.1, 1))
+        try:
+            postcode = home[postcode_column]
+            house_number = home[address1_column]
+            full_address = home[fulladdress_column]
+
+            searcher = SearchEpc(
+                address1=str(house_number),
+                postcode=postcode,
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=full_address,
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Look for EPC recommendations
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except:
+                property_recommendations = {"rows": []}
+
+            # Retrieve data from FindMyEPC
+            find_epc_searcher = RetrieveFindMyEpc(
+                address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
+            )
+            find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+            time.sleep(np.random.uniform(0.1, 1))
+
+            epc = {
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"],
+                "find_my_epc_data": find_epc_data,
+            }
+
+            epc_data.append(epc)
+        except Exception as e:
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
+def extract_address1(asset_list, full_address_col, method="first_two_words"):
+    if method == "first_two_words":
+        asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+        return asset_list
+
+    raise ValueError(f"Method {method} not recognized")
+
+
+def app():
+    """
+    This app pulls EPC data for some properties owned by Livewest
+
+    Data request contents:
+    Date of last EPC
+    Reason for EPC
+    SAP score on register
+    Property Type
+    Property Area
+    Property Age
+    Any Dimensions (HLP,PW,RH)
+    Property Wall Construction
+    Heating Type
+    Secondary Heating
+    Loft Insulation Depth
+
+    Additional if possible:
+    Heat loss calculations
+    EPC recommendations
+    Property UPRN
+
+    """
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
+    DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
+    POSTCODE_COLUMN = "Postcode"
+    FULLADDRESS_COLUMN = "Address"
+    ADDRESS1_COLUMN = None
+    ADDRESS1_METHOD = "first_two_words"
+
+    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
+    asset_list["row_id"] = asset_list.index
+
+    # We clean up potential non-breaking spaces, and double spaces
+    for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
+        asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
+        asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)
+
+    if ADDRESS1_COLUMN is None:
+        ADDRESS1_COLUMN = "address1_extracted"
+        asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)
+
+    epc_data, errors = get_data(
+        asset_list=asset_list,
+        fulladdress_column=FULLADDRESS_COLUMN,
+        address1_column=ADDRESS1_COLUMN,
+        postcode_column=POSTCODE_COLUMN
+    )
+
+    # We now retrieve any failed properties
+    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
+    epc_data_failed, _ = get_data(
+        asset_list=asset_list_failed,
+        fulladdress_column=FULLADDRESS_COLUMN,
+        address1_column=ADDRESS1_COLUMN,
+        postcode_column=POSTCODE_COLUMN
+    )
+
+    # Append the failed data to the main data
+    epc_data.extend(epc_data_failed)
+
+    epc_df = pd.DataFrame(epc_data)
+
+    # We expand out the recommendations
+    recommendations_df = epc_df[["row_id", "recommendations"]]
+
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    columns = ["row_id"] + list(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data["row_id"] = row["row_id"]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    # Drop the column that is ""
+    transformed_df = transformed_df.drop(columns=[""])
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [
+            "row_id",
+            "uprn",
+            "property-type",
+            "built-form",
+            "inspection-date",
+            "current-energy-rating",
+            "current-energy-efficiency",
+            "roof-description",
+            "walls-description",
+            "transaction-type",
+            # New fields needed
+            "secondheat-description",
+            "total-floor-area",
+            "construction-age-band",
+            "floor-height",
+            "number-habitable-rooms",
+            "mainheat-description",
+            #
+            "energy-consumption-current",  # kwh/m2
+        ]
+    ]
+
+    asset_list = asset_list.merge(
+        epc_df,
+        how="left",
+        on="row_id"
+    ).merge(
+        transformed_df,
+        how="left",
+        on="row_id"
+    )
+
+    asset_list = asset_list.drop(columns=["row_id"])
+
+    # Rename the columns
+    asset_list = asset_list.rename(columns={
+        "inspection-date": "Date of last EPC",
+        "current-energy-efficiency": "SAP score on register",
+        "current-energy-rating": "EPC rating on register",
+        "property-type": "Property Type",
+        "built-form": "Archetype",
+        "total-floor-area": "Property Floor Area",
+        "construction-age-band": "Property Age Band",
+        "floor-height": "Property Floor Height",
+        "number-habitable-rooms": "Number of Habitable Rooms",
+        "walls-description": "Wall Construction",
+        "roof-description": "Roof Construction",
+        "mainheat-description": "Heating Type",
+        "secondheat-description": "Secondary Heating",
+        "transaction-type": "Reason for last EPC",
+        "energy-consumption-current": "Heat Demand (kWh/m2)"
+    })
+
+    asset_list["Estimated Number of Floors"] = asset_list.apply(
+        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
+            x["Property Type"]) else None, axis=1
+    )
+
+    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+    # Replace "" value with None
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
+
+    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
+        lambda x: estimate_perimeter(
+            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
+            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
+        ), axis=1
+    )
+
+    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
+        lambda x: estimate_external_wall_area(
+            num_floors=x["Estimated Number of Floors"],
+            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
+            perimeter=x["Estimated Perimeter (m)"],
+            built_form=x["Archetype"]
+        ),
+        axis=1
+    )
+
+    asset_list["Roof Insulation Thickness"] = asset_list.apply(
+        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
+            x["Roof Construction"]) else None,
+        axis=1
+    )
+
+    # Store as an excel
+    filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
+    asset_list.to_excel(filename, index=False)
diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt
new file mode 100644
index 00000000..e69de29b

From dc1cf6d6045c5f94e2826f6ff20010e05043d1ff Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 Nov 2024 15:49:08 +0000
Subject: [PATCH 04/31] working on stonewater matching algorithm

---
 .../southend/epc_data_pull_2024_11_14.py      |   4 -
 .../stonewater/Wave 3 Preparation.py          | 133 +++++++++++++++++-
 etl/route_march_data_pull/app.py              |  43 +++++-
 3 files changed, 171 insertions(+), 9 deletions(-)

diff --git a/etl/customers/southend/epc_data_pull_2024_11_14.py b/etl/customers/southend/epc_data_pull_2024_11_14.py
index 14cd73be..11ddcc6f 100644
--- a/etl/customers/southend/epc_data_pull_2024_11_14.py
+++ b/etl/customers/southend/epc_data_pull_2024_11_14.py
@@ -229,7 +229,3 @@ def app():
     filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov "
                 "2024.xlsx")
     asset_list.to_excel(filename, index=False)
-
-    asset_list["% of the Roof with PV"].value_counts()
-
-    asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", "Postcode"]]
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index a5bbff7b..019c51c9 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -117,7 +117,7 @@ def extract_summary_report(pdf_path):
     - Fuel Bill
     - Address
     """
-    
+
     data = {
         "Address": None,
         "Postcode": None,
@@ -1618,5 +1618,136 @@ def append_stonewater_id():
         index=False
     )
 
+
+def propsed_wave_3_sample():
+    """
+    Stonewater want to ensure that, when selecting properties for wave 3, they choose properties
+    such that most of the properties within a geographical area are treatable within the bid.
+    Namely, if we take a geographical area (which could be a postal region), they want most, and ideally all, of the
+    properties within that geographical area to be included within the bid.
+    :return:
+    """
+
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+        "- Archetyped V3.1.xlsx",
+        header=4
+    )
+    # Clean address ids
+    asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
+    asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
+    asset_list["Address ID"] = asset_list["Address ID"].astype(int)
+
+    # Create the postal region, taking the first part of the postcode
+    asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
+    unique_postal_regions = asset_list["Postal Region"].unique()
+
+    # Keep just the columns we need
+    asset_list = asset_list[
+        ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
+         "Heating"]
+    ]
+
+    survey_results = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"),
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+
+    # TODO: We probably want the actual surveyed wall, roof, heating type
+    survey_results = survey_results[
+        ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"]
+    ]
+    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
+
+    survey_results_with_original_features = survey_results.merge(
+        asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+        on="Address ID",
+        how="left"
+    )
+
+    if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
+        raise ValueError("Something went wrong")
+
+    # Tier definitions
+    # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is EPC D or below
+    # Tier 2: We have a property in the same archetype that was surveyed and is EPC D or below
+    #
+
+    for region in unique_postal_regions:
+        # Take all of the properties in that region
+        region_assets = asset_list[asset_list["Postal Region"] == region].copy()
+        archetypes = region_assets["Archetype ID"].unique()
+        # We get the properties that have been surveyed
+        region_surveyed = survey_results[
+            survey_results["Archetype ID"].isin(archetypes) &
+            (survey_results["Postal Region"] == region)
+            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        if region_surveyed["Archetype ID"].duplicated().sum():
+            raise NotImplementedError("Fix me")
+
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on="Archetype ID",
+            how="left"
+        )
+
+        # Label the tier 1 properties
+        region_assets["Confidence Tier"] = None
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]),
+            "1", region_assets["Confidence Tier"]
+        )
+        # TODO: Turn into a function
+        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
+
+        region_surveyed = survey_results[
+            survey_results["Archetype ID"].isin(missed_archetypes)
+        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        if region_surveyed["Archetype ID"].duplicated().sum():
+            raise NotImplementedError("Fix me 2")
+
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on="Archetype ID",
+            how="left",
+            suffixes=("", "_method2")
+        )
+
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]),
+            "2 - same archetype", region_assets["Confidence Tier"]
+        )
+
+        region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna(
+            region_assets["Current EPC Band_method2"])
+
+        region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
+
+        missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
+
+        # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
+        for a_id in missed_addressids:
+            property = asset_list[asset_list["Address ID"] == a_id].squeeze()
+
+            surveyed_same_postcode = survey_results_with_original_features[
+                (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+                (survey_results_with_original_features["Property Type"] == property["Property Type"])
+                ]
+
+            surveyed_same_region = survey_results_with_original_features[
+                (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+                (survey_results_with_original_features["Property Type"] == property["Property Type"])
+                ]
+
+        same_postcode = survey_results[
+            survey_results["Archetype ID"].isin(missed_archetypes) &
+            (survey_results["Postal Region"] == region)
+            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        pd.isnull(region_assets["Current EPC Band"]).sum()
+
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 060897f8..f24c5bb2 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -206,6 +206,14 @@ def app():
     # Drop the column that is ""
     transformed_df = transformed_df.drop(columns=[""])
 
+    # Get the find my epc data
+    find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
+        pd.json_normalize(epc_df["find_my_epc_data"])
+    )
+    # We check if we get the solar pv column:
+    if "Solar photovoltaics" not in find_my_epc_data.columns:
+        find_my_epc_data["Solar photovoltaics"] = False
+
     # Retrieve just the data we need
     epc_df = epc_df[
         [
@@ -228,6 +236,7 @@ def app():
             "mainheat-description",
             #
             "energy-consumption-current",  # kwh/m2
+            "photo-supply",
         ]
     ]
 
@@ -236,12 +245,25 @@ def app():
         how="left",
         on="row_id"
     ).merge(
-        transformed_df,
+        find_my_epc_data[
+            [
+                "row_id", "heating_text", "hot_water_text", 'Assessor’s name',
+                "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
+                "Assessor’s ID", "Solar photovoltaics"
+            ]
+        ].rename(
+            columns={
+                "Solar photovoltaics": "Has Solar PV",
+                "heating_text": "Heating Estimated kWh",
+                "hot_water_text": "Hot Water Estimated kWh",
+            }
+        ),
         how="left",
         on="row_id"
     )
 
-    asset_list = asset_list.drop(columns=["row_id"])
+    asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
+    asset_list = asset_list.drop(columns=["photo-supply"])
 
     # Rename the columns
     asset_list = asset_list.rename(columns={
@@ -259,7 +281,7 @@ def app():
         "mainheat-description": "Heating Type",
         "secondheat-description": "Secondary Heating",
         "transaction-type": "Reason for last EPC",
-        "energy-consumption-current": "Heat Demand (kWh/m2)"
+        "energy-consumption-current": "Heat Demand (kWh/m2)",
     })
 
     asset_list["Estimated Number of Floors"] = asset_list.apply(
@@ -295,6 +317,19 @@ def app():
         axis=1
     )
 
+    # For all of the columns in transformed_df, prefix with "Recommendation: "
+    for col in transformed_df.columns:
+        if col == "row_id":
+            continue
+        transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})
+
+    asset_list = asset_list.merge(
+        transformed_df,
+        how="left",
+        on="row_id"
+    )
+    asset_list = asset_list.drop(columns=["row_id"])
+
     # Store as an excel
-    filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
+    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
     asset_list.to_excel(filename, index=False)

From c13c84b98cbab169300306adeba534145496251c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 Nov 2024 15:55:19 +0000
Subject: [PATCH 05/31] First region implemented

---
 .../stonewater/Wave 3 Preparation.py          | 58 +++++++++++++++----
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 019c51c9..7c104f97 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1729,25 +1729,61 @@ def propsed_wave_3_sample():
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 
         # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
+        final_missed_matches = []
         for a_id in missed_addressids:
             property = asset_list[asset_list["Address ID"] == a_id].squeeze()
 
-            surveyed_same_postcode = survey_results_with_original_features[
+            # TODO: This is quite strict for the moment - we might want to relax this by creating reduced versions
+            #       of the wall, roof and heating features, splitting them on the colons and taking the first part
+            surveyed_similar = survey_results_with_original_features[
                 (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
-                (survey_results_with_original_features["Property Type"] == property["Property Type"])
+                (survey_results_with_original_features["Property Type"] == property["Property Type"]) &
+                (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) &
+                (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) &
+                (survey_results_with_original_features["Heating"] == property["Heating"])
                 ]
+            if surveyed_similar.empty:
+                surveyed_similar = survey_results_with_original_features[
+                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+                    (survey_results_with_original_features["Property Type"] == property["Property Type"]) &
+                    (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) &
+                    (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) &
+                    (survey_results_with_original_features["Heating"] == property["Heating"])
+                    ]
 
-            surveyed_same_region = survey_results_with_original_features[
-                (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                (survey_results_with_original_features["Property Type"] == property["Property Type"])
-                ]
+            if surveyed_similar.empty:
+                final_missed_matches.append(
+                    {
+                        "Address ID": a_id,
+                        "Confidence Tier": "5 - no similar property, needs survey to confirm"
+                    }
+                )
+                continue
 
-        same_postcode = survey_results[
-            survey_results["Archetype ID"].isin(missed_archetypes) &
-            (survey_results["Postal Region"] == region)
-            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+            raise NotImplementedError("Implement me")
 
-        pd.isnull(region_assets["Current EPC Band"]).sum()
+        final_missed_matches = pd.DataFrame(final_missed_matches)
+
+        region_assets = region_assets.merge(
+            final_missed_matches,
+            on="Address ID",
+            how="left",
+            suffixes=("", "_method3")
+        )
+
+        region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna(
+            region_assets["Confidence Tier_method3"]
+        )
+
+        region_assets = region_assets.drop(columns=["Confidence Tier_method3"])
+
+        region_assets["Current EPC Band"] = np.where(
+            region_assets["Confidence Tier"] == "5 - no similar property, needs survey to confirm",
+            "Unknown", region_assets["Current EPC Band"]
+        )
+
+        if pd.isnull(region_assets["Current EPC Band"]).sum():
+            raise Exception("Something went wrong")
 
 # if __name__ == "__main__":
 #     main()

From 8f9b8f08862cbadcbd0daaa29219cd0980606b3f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 Nov 2024 16:30:23 +0000
Subject: [PATCH 06/31] working on algorithm

---
 etl/customers/stonewater/Wave 3 Preparation.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 7c104f97..008fd3bc 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1633,6 +1633,9 @@ def propsed_wave_3_sample():
         "- Archetyped V3.1.xlsx",
         header=4
     )
+
+    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
+    asset_list = asset_list[asset_list["Archetype ID"] == "NOT PRIORITY POSTCODE"]
     # Clean address ids
     asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@@ -1674,6 +1677,7 @@ def propsed_wave_3_sample():
     # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
     #
 
+    results = []
     for region in unique_postal_regions:
         # Take all of the properties in that region
         region_assets = asset_list[asset_list["Postal Region"] == region].copy()
@@ -1722,10 +1726,17 @@ def propsed_wave_3_sample():
         )
 
         region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna(
-            region_assets["Current EPC Band_method2"])
+            region_assets["Current EPC Band_method2"].astype(str),
+        )
 
         region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
 
+        # We label EPC C properties
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["C", "B", "A"]),
+            "6 - EPC C or above", region_assets["Confidence Tier"]
+        )
+
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 
         # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
@@ -1785,5 +1796,7 @@ def propsed_wave_3_sample():
         if pd.isnull(region_assets["Current EPC Band"]).sum():
             raise Exception("Something went wrong")
 
+        results.append(region_assets)
+
 # if __name__ == "__main__":
 #     main()

From 2158ab2cd50df7edcfc7e119b56237145f4f1dd1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 Nov 2024 16:33:43 +0000
Subject: [PATCH 07/31] debugging stonewater alg

---
 etl/customers/stonewater/Wave 3 Preparation.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 008fd3bc..ef7dd414 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1635,7 +1635,7 @@ def propsed_wave_3_sample():
     )
 
     # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
-    asset_list = asset_list[asset_list["Archetype ID"] == "NOT PRIORITY POSTCODE"]
+    asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"]
     # Clean address ids
     asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@@ -1678,7 +1678,7 @@ def propsed_wave_3_sample():
     #
 
     results = []
-    for region in unique_postal_regions:
+    for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
         region_assets = asset_list[asset_list["Postal Region"] == region].copy()
         archetypes = region_assets["Archetype ID"].unique()
@@ -1739,7 +1739,11 @@ def propsed_wave_3_sample():
 
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 
-        # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
+        if not missed_addressids:
+            results.append(region_assets)
+            continue
+
+            # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
         final_missed_matches = []
         for a_id in missed_addressids:
             property = asset_list[asset_list["Address ID"] == a_id].squeeze()

From 4d021f0ba6a5894659275d8090e1f65be6ca68f6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 Nov 2024 17:12:55 +0000
Subject: [PATCH 08/31] working on stonewater alg

---
 .../stonewater/Wave 3 Preparation.py          | 102 +++++++++++++++---
 1 file changed, 86 insertions(+), 16 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index ef7dd414..40dfd38e 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3,6 +3,7 @@ import PyPDF2
 import re
 import pandas as pd
 import numpy as np
+from docutils.utils.math.tex2mathml_extern import blahtexml
 from tqdm import tqdm
 from collections import Counter
 
@@ -1681,19 +1682,15 @@ def propsed_wave_3_sample():
     for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
         region_assets = asset_list[asset_list["Postal Region"] == region].copy()
-        archetypes = region_assets["Archetype ID"].unique()
-        # We get the properties that have been surveyed
-        region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(archetypes) &
-            (survey_results["Postal Region"] == region)
-            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
-        if region_surveyed["Archetype ID"].duplicated().sum():
-            raise NotImplementedError("Fix me")
+        # We have a tier 1 match if the property itself was surveyed
+        exact_surveyed = survey_results[
+            survey_results["Address ID"].isin(region_assets["Address ID"])
+        ]
 
         region_assets = region_assets.merge(
-            region_surveyed,
-            on="Archetype ID",
+            exact_surveyed[["Address ID", "Current EPC Band"]],
+            on="Address ID",
             how="left"
         )
 
@@ -1701,22 +1698,95 @@ def propsed_wave_3_sample():
         region_assets["Confidence Tier"] = None
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]),
-            "1", region_assets["Confidence Tier"]
+            "1 - property was surveyed", region_assets["Confidence Tier"]
         )
-        # TODO: Turn into a function
-        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
 
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["C", "B", "A"]),
+            "6 - property was surveyed", region_assets["Confidence Tier"]
+        )
+
+        archetypes = region_assets[
+            pd.isnull(region_assets["Confidence Tier"])
+        ]["Archetype ID"].unique()
+        # We get the properties that have been surveyed
         region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(missed_archetypes)
-        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+            survey_results["Archetype ID"].isin(archetypes) &
+            (survey_results["Postal Region"] == region)
+            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
-            raise NotImplementedError("Fix me 2")
+            # Take the duplicated archetypes
+            duplicated_archetypes = region_surveyed[
+                region_surveyed["Archetype ID"].duplicated()
+            ]["Archetype ID"].unique()
+            duplicated_archetypes = region_surveyed[
+                region_surveyed["Archetype ID"].isin(duplicated_archetypes)
+            ]
+
+            # We need to select which one is the most relevant to these properties
+            survey_data = survey_results_with_original_features[
+                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values)
+            ]
+
+            raise NotImplementedError("Fix me")
 
         region_assets = region_assets.merge(
             region_surveyed,
             on="Archetype ID",
             how="left",
+            suffixes=("", "_method1")
+        )
+
+        # Label the tier 1 properties
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
+            pd.isnull(region_assets["Confidence Tier"]),
+            "1 - Archetype surveyed", region_assets["Confidence Tier"]
+        )
+        region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
+        # TODO: Turn into a function
+        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
+
+        archetype_surveyed = survey_results[
+            survey_results["Archetype ID"].isin(missed_archetypes)
+        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+
+        if archetype_surveyed["Archetype ID"].duplicated().sum():
+            # We need to select which one is the most relevant to these properties
+            duplicated_archetypes = archetype_surveyed[
+                archetype_surveyed["Archetype ID"].duplicated()
+            ]["Archetype ID"].unique()
+
+            survey_data = survey_results_with_original_features[
+                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes)
+            ]
+
+            homes_with_these_archetypes = region_assets[
+                region_assets["Archetype ID"].isin(duplicated_archetypes)
+            ]
+
+            for _, home in homes_with_these_archetypes.iterrows():
+                first_filter = survey_data[
+                    (survey_data["Postal Region"] == home["Postal Region"]) &
+                    (survey_data["Property Type"] == home["Property Type"]) &
+                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
+                    ]
+
+                if not first_filter.empty:
+                    NotImplementedError("Fix me 0")
+
+                second_filter = survey_data[
+                    (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) &
+                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
+                    ]
+
+            raise NotImplementedError("Fix me 2")
+
+        region_assets = region_assets.merge(
+            archetype_surveyed,
+            on="Archetype ID",
+            how="left",
             suffixes=("", "_method2")
         )
 

From d00c291c17dacb545eef4b708047ec5c699baf18 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 15:16:54 +0000
Subject: [PATCH 09/31] debugging stonewater algorithm

---
 .../stonewater/Wave 3 Preparation.py          | 68 +++++++------------
 1 file changed, 25 insertions(+), 43 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 40dfd38e..5b1e2f91 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1716,20 +1716,11 @@ def propsed_wave_3_sample():
             ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
-            # Take the duplicated archetypes
-            duplicated_archetypes = region_surveyed[
-                region_surveyed["Archetype ID"].duplicated()
-            ]["Archetype ID"].unique()
-            duplicated_archetypes = region_surveyed[
-                region_surveyed["Archetype ID"].isin(duplicated_archetypes)
-            ]
-
-            # We need to select which one is the most relevant to these properties
-            survey_data = survey_results_with_original_features[
-                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values)
-            ]
-
-            raise NotImplementedError("Fix me")
+            region_surveyed = survey_results[
+                survey_results["Archetype ID"].isin(archetypes) &
+                (survey_results["Postal Region"] == region)
+                ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
+            region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc)
 
         region_assets = region_assets.merge(
             region_surveyed,
@@ -1744,6 +1735,17 @@ def propsed_wave_3_sample():
             pd.isnull(region_assets["Confidence Tier"]),
             "1 - Archetype surveyed", region_assets["Confidence Tier"]
         )
+
+        region_assets["Current EPC Band"] = np.where(
+            pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]),
+            region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"]
+        )
+        # Handle EPC C
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
+            "6 - EPC C or above", region_assets["Confidence Tier"]
+        )
+
         region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
         # TODO: Turn into a function
         missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
@@ -1752,36 +1754,16 @@ def propsed_wave_3_sample():
             survey_results["Archetype ID"].isin(missed_archetypes)
         ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
+        # TODO - We could average the property?? And call it borderline, call out it was averaged!!!
+        #        We could also find the nearest property to it, with similar wall, roof, heating?
+        #        Can use long/lag to distance calc. We have this data from previous
+
         if archetype_surveyed["Archetype ID"].duplicated().sum():
-            # We need to select which one is the most relevant to these properties
-            duplicated_archetypes = archetype_surveyed[
-                archetype_surveyed["Archetype ID"].duplicated()
-            ]["Archetype ID"].unique()
-
-            survey_data = survey_results_with_original_features[
-                survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes)
-            ]
-
-            homes_with_these_archetypes = region_assets[
-                region_assets["Archetype ID"].isin(duplicated_archetypes)
-            ]
-
-            for _, home in homes_with_these_archetypes.iterrows():
-                first_filter = survey_data[
-                    (survey_data["Postal Region"] == home["Postal Region"]) &
-                    (survey_data["Property Type"] == home["Property Type"]) &
-                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
-                    ]
-
-                if not first_filter.empty:
-                    NotImplementedError("Fix me 0")
-
-                second_filter = survey_data[
-                    (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) &
-                    (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
-                    ]
-
-            raise NotImplementedError("Fix me 2")
+            archetype_surveyed = survey_results[
+                survey_results["Archetype ID"].isin(missed_archetypes)
+            ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
+            archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
+            archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])
 
         region_assets = region_assets.merge(
             archetype_surveyed,

From 05cf7514783786261f7efe70eda5486712f8fb4c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 16:00:59 +0000
Subject: [PATCH 10/31] debugging

---
 .../stonewater/Wave 3 Preparation.py          | 32 +++++++++++++------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 5b1e2f91..d2110de8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1777,8 +1777,9 @@ def propsed_wave_3_sample():
             "2 - same archetype", region_assets["Confidence Tier"]
         )
 
-        region_assets["Current EPC Band"] = region_assets["Current EPC Band"].fillna(
-            region_assets["Current EPC Band_method2"].astype(str),
+        region_assets["Current EPC Band"] = np.where(
+            pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]),
+            region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"]
         )
 
         region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
@@ -1822,12 +1823,26 @@ def propsed_wave_3_sample():
                 final_missed_matches.append(
                     {
                         "Address ID": a_id,
-                        "Confidence Tier": "5 - no similar property, needs survey to confirm"
+                        "Confidence Tier": "5 - no similar property, needs survey to confirm",
+                        "Current EPC Band": "Unknown"
                     }
                 )
                 continue
+            # We take an average
+            expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            expected_epc = sap_to_epc(expected_sap)
+            if expected_epc in ["C", "B", "A"]:
+                tier = "6 - EPC C or above"
+            else:
+                tier = "3 - similar property"
 
-            raise NotImplementedError("Implement me")
+            final_missed_matches.append(
+                {
+                    "Address ID": a_id,
+                    "Confidence Tier": tier,
+                    "Current EPC Band": "Unknown"
+                }
+            )
 
         final_missed_matches = pd.DataFrame(final_missed_matches)
 
@@ -1841,14 +1856,13 @@ def propsed_wave_3_sample():
         region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna(
             region_assets["Confidence Tier_method3"]
         )
+        region_assets["Current EPC Band"] = np.where(
+            pd.isnull(region_assets["Current EPC Band"]),
+            region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"]
+        )
 
         region_assets = region_assets.drop(columns=["Confidence Tier_method3"])
 
-        region_assets["Current EPC Band"] = np.where(
-            region_assets["Confidence Tier"] == "5 - no similar property, needs survey to confirm",
-            "Unknown", region_assets["Current EPC Band"]
-        )
-
         if pd.isnull(region_assets["Current EPC Band"]).sum():
             raise Exception("Something went wrong")
 

From 7d209d5d8e07b4112bffcdcfc748d04cc299abe6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 16:28:43 +0000
Subject: [PATCH 11/31] creating loss and gain columns

---
 .../stonewater/Wave 3 Preparation.py          | 48 +++++++++++++++----
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index d2110de8..b36ae756 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1703,7 +1703,7 @@ def propsed_wave_3_sample():
 
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band"].isin(["C", "B", "A"]),
-            "6 - property was surveyed", region_assets["Confidence Tier"]
+            "5 - property was surveyed", region_assets["Confidence Tier"]
         )
 
         archetypes = region_assets[
@@ -1721,6 +1721,7 @@ def propsed_wave_3_sample():
                 (survey_results["Postal Region"] == region)
                 ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
             region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc)
+            region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"])
 
         region_assets = region_assets.merge(
             region_surveyed,
@@ -1743,7 +1744,7 @@ def propsed_wave_3_sample():
         # Handle EPC C
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
-            "6 - EPC C or above", region_assets["Confidence Tier"]
+            "5 - EPC C or above", region_assets["Confidence Tier"]
         )
 
         region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
@@ -1773,7 +1774,8 @@ def propsed_wave_3_sample():
         )
 
         region_assets["Confidence Tier"] = np.where(
-            region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]),
+            region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
+                region_assets["Confidence Tier"]),
             "2 - same archetype", region_assets["Confidence Tier"]
         )
 
@@ -1786,8 +1788,8 @@ def propsed_wave_3_sample():
 
         # We label EPC C properties
         region_assets["Confidence Tier"] = np.where(
-            region_assets["Current EPC Band"].isin(["C", "B", "A"]),
-            "6 - EPC C or above", region_assets["Confidence Tier"]
+            region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
+            "5 - EPC C or above", region_assets["Confidence Tier"]
         )
 
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
@@ -1823,7 +1825,7 @@ def propsed_wave_3_sample():
                 final_missed_matches.append(
                     {
                         "Address ID": a_id,
-                        "Confidence Tier": "5 - no similar property, needs survey to confirm",
+                        "Confidence Tier": "4 - no similar property, needs survey to confirm",
                         "Current EPC Band": "Unknown"
                     }
                 )
@@ -1832,7 +1834,7 @@ def propsed_wave_3_sample():
             expected_sap = surveyed_similar["Current SAP Rating"].mean()
             expected_epc = sap_to_epc(expected_sap)
             if expected_epc in ["C", "B", "A"]:
-                tier = "6 - EPC C or above"
+                tier = "5 - EPC C or above"
             else:
                 tier = "3 - similar property"
 
@@ -1861,12 +1863,42 @@ def propsed_wave_3_sample():
             region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"]
         )
 
-        region_assets = region_assets.drop(columns=["Confidence Tier_method3"])
+        region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"])
 
         if pd.isnull(region_assets["Current EPC Band"]).sum():
             raise Exception("Something went wrong")
 
         results.append(region_assets)
 
+    results = pd.concat(results)
+
+    # Create a pivot table for counts of Confidence Tier by Postal Region
+    geographic_summary = results.pivot_table(
+        index='Postal Region',
+        columns='Confidence Tier',
+        aggfunc='size',
+        fill_value=0
+    ).reset_index()
+
+    # We create the gain and loss columns
+    # Gain is the sum of these columns:
+    # '1 - Archetype surveyed', '1 - property was surveyed',
+    #        '2 - same archetype', '3 - similar property',
+    # Loss is the sum of these columns:
+    # '4 - no similar property, needs survey to confirm',
+    # '5 - EPC C or above', '5 - property was surveyed'
+    geographic_summary["Gain"] = geographic_summary[
+        ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property']
+    ].sum(axis=1)
+
+    geographic_summary["Loss"] = geographic_summary[
+        ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed']
+    ].sum(axis=1)
+
+    geographic_summary.sum()
+
+    geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
+    geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
+
 # if __name__ == "__main__":
 #     main()

From a01ff1d8dedaaf78e8ce95b21305a6f1a430ae3e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 16:45:10 +0000
Subject: [PATCH 12/31] tweaking postal region algorithm - may need to swap to
 postcode or street

---
 .../stonewater/Wave 3 Preparation.py          | 44 ++++++++++++++-----
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index b36ae756..20f771ec 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1803,22 +1803,43 @@ def propsed_wave_3_sample():
         for a_id in missed_addressids:
             property = asset_list[asset_list["Address ID"] == a_id].squeeze()
 
-            # TODO: This is quite strict for the moment - we might want to relax this by creating reduced versions
-            #       of the wall, roof and heating features, splitting them on the colons and taking the first part
+            if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
+                filter_property_types = ["House", "Bungalow"]
+            else:
+                filter_property_types = ["Flat"]
+
             surveyed_similar = survey_results_with_original_features[
                 (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
-                (survey_results_with_original_features["Property Type"] == property["Property Type"]) &
-                (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) &
-                (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) &
-                (survey_results_with_original_features["Heating"] == property["Heating"])
+                (
+                    survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+                        filter_property_types
+                    )
+                ) &
+                (
+                    survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                    property["Wall Type"].split(":")[0]
+                ) &
+                (
+                    survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                    property["Roof Type"].split(":")[0]
+                ) &
+                (
+                    survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+                    property["Heating"].split(":")[0]
+                )
                 ]
             if surveyed_similar.empty:
                 surveyed_similar = survey_results_with_original_features[
                     (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"] == property["Property Type"]) &
-                    (survey_results_with_original_features["Wall Type"] == property["Wall Type"]) &
-                    (survey_results_with_original_features["Roof Type"] == property["Roof Type"]) &
-                    (survey_results_with_original_features["Heating"] == property["Heating"])
+                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+                        filter_property_types
+                    )) &
+                    (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                     property["Wall Type"].split(":")[0]) &
+                    (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                     property["Roof Type"].split(":")[0]) &
+                    (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+                     property["Heating"].split(":")[0])
                     ]
 
             if surveyed_similar.empty:
@@ -1842,7 +1863,7 @@ def propsed_wave_3_sample():
                 {
                     "Address ID": a_id,
                     "Confidence Tier": tier,
-                    "Current EPC Band": "Unknown"
+                    "Current EPC Band": expected_epc
                 }
             )
 
@@ -1899,6 +1920,7 @@ def propsed_wave_3_sample():
 
     geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
     geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
+    geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
 
 # if __name__ == "__main__":
 #     main()

From 7d63c164045c6855ea6cb13091788a2ed7db2afb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 18:05:05 +0000
Subject: [PATCH 13/31] implemented linear programming to find maximal bid size

---
 .../stonewater/Wave 3 Preparation.py          | 71 ++++++++++++++++---
 .../requirements/requirements-wave-3-prep.txt |  1 +
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 20f771ec..c397f962 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3,9 +3,9 @@ import PyPDF2
 import re
 import pandas as pd
 import numpy as np
-from docutils.utils.math.tex2mathml_extern import blahtexml
 from tqdm import tqdm
 from collections import Counter
+from scipy.optimize import linprog
 
 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
 SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
@@ -1843,13 +1843,38 @@ def propsed_wave_3_sample():
                     ]
 
             if surveyed_similar.empty:
-                final_missed_matches.append(
-                    {
-                        "Address ID": a_id,
-                        "Confidence Tier": "4 - no similar property, needs survey to confirm",
-                        "Current EPC Band": "Unknown"
-                    }
-                )
+
+                # We get an average based on the postcode
+                surveyed_similar = survey_results_with_original_features[
+                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+                        filter_property_types
+                    ))
+                    ]
+                if surveyed_similar.empty:
+                    final_missed_matches.append(
+                        {
+                            "Address ID": a_id,
+                            "Confidence Tier": "4 - no similar property, needs survey to confirm",
+                            "Current EPC Band": "Unknown"
+                        }
+
+                    )
+                else:
+                    expected_sap = surveyed_similar["Current SAP Rating"].mean()
+                    expected_epc = sap_to_epc(expected_sap)
+                    if expected_epc in ["C", "B", "A"]:
+                        tier = "5 - EPC C or above"
+                    else:
+                        tier = "3 - similar property, relaxed conditions"
+
+                    final_missed_matches.append(
+                        {
+                            "Address ID": a_id,
+                            "Confidence Tier": tier,
+                            "Current EPC Band": expected_epc
+                        }
+                    )
                 continue
             # We take an average
             expected_sap = surveyed_similar["Current SAP Rating"].mean()
@@ -1922,5 +1947,35 @@ def propsed_wave_3_sample():
     geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
     geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
 
+    geographic_summary[["Loss", "Gain"]].head()
+
+    loss = geographic_summary["Loss"].values
+    gain = geographic_summary["Gain"].values
+
+    # Define the coefficients for the objective function (negative because we maximize Gain)
+    c = -gain
+
+    # Define constraints
+    A = [loss]  # Only 1 constraint for now, total Loss
+    b = [250]  # Maximum total Loss allowed
+
+    # Bounds for each variable (select or not select each row, 0 <= x <= 1)
+    bounds = [(0, 1) for _ in gain]
+
+    # Solve the problem using linprog with HiGHS solver
+    result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs')
+    if not result.success:
+        raise Exception("Optimization failed")
+
+    selected_rows = result.x.round().astype(int)  # Rounded to 0 or 1
+    optimal_gain = -result.fun
+    print(optimal_gain)
+
+    # Select the rows that are selected
+    geographic_summary["Selected"] = selected_rows == 1
+    geographic_summary[geographic_summary["Selected"]].sum()
+    bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum()
+    print("Bid Size:", bid_size)
+
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
index 3ad5d2c1..09ba20bd 100644
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -7,4 +7,5 @@ epc-api-python==1.0.2
 usaddress==0.5.11
 fuzzywuzzy==0.18.0
 python-dotenv
+scipy
 

From eff80e637f73490c3f45d2ef0ffcc71a188e95cb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 19:10:23 +0000
Subject: [PATCH 14/31] implementing distance weighting

---
 .../stonewater/Wave 3 Preparation.py          | 332 +++++++++++++-----
 1 file changed, 248 insertions(+), 84 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index c397f962..3b44d560 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1635,8 +1635,9 @@ def propsed_wave_3_sample():
         header=4
     )
 
-    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
-    asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"]
+    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
+    # UPRN
+    asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
     # Clean address ids
     asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@@ -1648,7 +1649,7 @@ def propsed_wave_3_sample():
 
     # Keep just the columns we need
     asset_list = asset_list[
-        ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
+        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
          "Heating"]
     ]
 
@@ -1665,7 +1666,7 @@ def propsed_wave_3_sample():
     survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
 
     survey_results_with_original_features = survey_results.merge(
-        asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+        asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
         on="Address ID",
         how="left"
     )
@@ -1673,6 +1674,45 @@ def propsed_wave_3_sample():
     if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
         raise ValueError("Something went wrong")
 
+    # We get longitude & Latitude
+    from utils.s3 import read_pickle_from_s3
+    archetyping_spatial_features = read_pickle_from_s3(
+        bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+    )
+    archetyping_spatial_features = pd.concat(archetyping_spatial_features)
+    archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename(
+        columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"}
+    )
+    # Merge them onto both datasets
+    asset_list = asset_list.merge(
+        archetyping_spatial_features, how="left", on="UPRN"
+    )
+    if pd.isnull(asset_list["longitude"]).sum():
+        raise ValueError("Something went wrong")
+
+    survey_results_with_original_features = survey_results_with_original_features.merge(
+        archetyping_spatial_features, how="left", on="UPRN"
+    )
+    if pd.isnull(survey_results_with_original_features["longitude"]).sum():
+        raise ValueError("Something went wrong")
+
+    def haversine(lat1, lon1, lat2, lon2):
+        # Radius of Earth in meters
+        R = 6371000
+
+        # Convert degrees to radians
+        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
+
+        # Differences
+        dlat = lat2 - lat1
+        dlon = lon2 - lon1
+
+        # Haversine formula
+        a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
+        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
+        distance = R * c
+        return distance
+
     # Tier definitions
     # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D
     # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
@@ -1716,6 +1756,7 @@ def propsed_wave_3_sample():
             ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
+            blah1
             region_surveyed = survey_results[
                 survey_results["Archetype ID"].isin(archetypes) &
                 (survey_results["Postal Region"] == region)
@@ -1755,23 +1796,46 @@ def propsed_wave_3_sample():
             survey_results["Archetype ID"].isin(missed_archetypes)
         ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
-        # TODO - We could average the property?? And call it borderline, call out it was averaged!!!
-        #        We could also find the nearest property to it, with similar wall, roof, heating?
-        #        Can use long/lag to distance calc. We have this data from previous
-
         if archetype_surveyed["Archetype ID"].duplicated().sum():
-            archetype_surveyed = survey_results[
-                survey_results["Archetype ID"].isin(missed_archetypes)
-            ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
-            archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
-            archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])
 
-        region_assets = region_assets.merge(
-            archetype_surveyed,
-            on="Archetype ID",
-            how="left",
-            suffixes=("", "_method2")
-        )
+            archetype_surveyed = []
+            for arch_id in missed_archetypes:
+                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                    archetype_data = survey_results_with_original_features[
+                        survey_results["Archetype ID"] == arch_id
+                        ].copy()
+                    if archetype_data.empty:
+                        continue
+                    archetype_data["distance_meters"] = haversine(
+                        lat1=property.latitude, lon1=property.longitude,
+                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                    )
+                    expected_sap = np.average(
+                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                    )
+                    expected_epc = sap_to_epc(expected_sap)
+                    archetype_surveyed.append(
+                        {
+                            "Archetype ID": arch_id,
+                            "Address ID": property["Address ID"],
+                            "Current EPC Band": expected_epc
+                        }
+                    )
+            archetype_surveyed = pd.DataFrame(archetype_surveyed)
+            region_assets = region_assets.merge(
+                archetype_surveyed,
+                on=["Archetype ID", "Address ID"],
+                how="left",
+                suffixes=("", "_method2")
+            )
+        else:
+
+            region_assets = region_assets.merge(
+                archetype_surveyed,
+                on="Archetype ID",
+                how="left",
+                suffixes=("", "_method2")
+            )
 
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
@@ -1792,6 +1856,16 @@ def propsed_wave_3_sample():
             "5 - EPC C or above", region_assets["Confidence Tier"]
         )
 
+        region_assets["Confidence Tier"] = np.where(
+            region_assets["Archetype ID"] == "EPC C OR ABOVE",
+            "5 - EPC C or above", region_assets["Confidence Tier"]
+        )
+
+        region_assets["Current EPC Band"] = np.where(
+            region_assets["Archetype ID"] == "EPC C OR ABOVE",
+            "C", region_assets["Current EPC Band"]
+        )
+
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 
         if not missed_addressids:
@@ -1803,17 +1877,10 @@ def propsed_wave_3_sample():
         for a_id in missed_addressids:
             property = asset_list[asset_list["Address ID"] == a_id].squeeze()
 
-            if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
-                filter_property_types = ["House", "Bungalow"]
-            else:
-                filter_property_types = ["Flat"]
-
-            surveyed_similar = survey_results_with_original_features[
-                (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+            surveyed = survey_results_with_original_features[
                 (
-                    survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    )
+                    survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+                    property["Property Type"].split(":")[0]
                 ) &
                 (
                     survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
@@ -1827,62 +1894,38 @@ def propsed_wave_3_sample():
                     survey_results_with_original_features["Heating"].str.split(":").str[0] ==
                     property["Heating"].split(":")[0]
                 )
-                ]
-            if surveyed_similar.empty:
-                surveyed_similar = survey_results_with_original_features[
-                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    )) &
-                    (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-                     property["Wall Type"].split(":")[0]) &
-                    (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-                     property["Roof Type"].split(":")[0]) &
-                    (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-                     property["Heating"].split(":")[0])
-                    ]
+                ].copy()
 
-            if surveyed_similar.empty:
+            if surveyed.empty:
+                blah3
 
-                # We get an average based on the postcode
-                surveyed_similar = survey_results_with_original_features[
-                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-                        filter_property_types
-                    ))
-                    ]
-                if surveyed_similar.empty:
-                    final_missed_matches.append(
-                        {
-                            "Address ID": a_id,
-                            "Confidence Tier": "4 - no similar property, needs survey to confirm",
-                            "Current EPC Band": "Unknown"
-                        }
+            # Calculate distance
+            surveyed["distance_meters"] = haversine(
+                lat1=property["latitude"], lon1=property["longitude"],
+                lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values
+            )
+            surveyed = surveyed.sort_values("distance_meters", ascending=True)
 
-                    )
-                else:
-                    expected_sap = surveyed_similar["Current SAP Rating"].mean()
-                    expected_epc = sap_to_epc(expected_sap)
-                    if expected_epc in ["C", "B", "A"]:
-                        tier = "5 - EPC C or above"
-                    else:
-                        tier = "3 - similar property, relaxed conditions"
+            # Check if we have a postcode match check if surveyed postcode is the same as the property postcode
+            if any(surveyed["Postcode"] == property["Postcode"]):
+                surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]]
 
-                    final_missed_matches.append(
-                        {
-                            "Address ID": a_id,
-                            "Confidence Tier": tier,
-                            "Current EPC Band": expected_epc
-                        }
-                    )
-                continue
-            # We take an average
-            expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            if any(surveyed["Postal Region"] == property["Postal Region"]):
+                surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
+
+            # Take the 5 nearest
+            surveyed_similar = surveyed_similar.head(5)
+
+            # perform a weighted mean of SAP rating - the closer the better
+            expected_sap = np.average(
+                surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1)
+            )
             expected_epc = sap_to_epc(expected_sap)
+
             if expected_epc in ["C", "B", "A"]:
                 tier = "5 - EPC C or above"
             else:
-                tier = "3 - similar property"
+                tier = "3 - similar property, weighted on distance"
 
             final_missed_matches.append(
                 {
@@ -1891,6 +1934,121 @@ def propsed_wave_3_sample():
                     "Current EPC Band": expected_epc
                 }
             )
+            continue
+
+            # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
+            #     filter_property_types = ["House", "Bungalow"]
+            # else:
+            #     filter_property_types = ["Flat"]
+            #
+            # surveyed_similar = survey_results_with_original_features[
+            #     (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
+            #     (
+            #         survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         )
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #         property["Wall Type"].split(":")[0]
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #         property["Roof Type"].split(":")[0]
+            #     ) &
+            #     (
+            #         survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #         property["Heating"].split(":")[0]
+            #     )
+            #     ]
+            # if surveyed_similar.empty:
+            #     surveyed_similar = survey_results_with_original_features[
+            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         )) &
+            #         (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #          property["Wall Type"].split(":")[0]) &
+            #         (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #          property["Roof Type"].split(":")[0]) &
+            #         (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #          property["Heating"].split(":")[0])
+            #         ]
+            #
+            # if surveyed_similar.empty:
+            #
+            #     # We get an average based on the postcode
+            #     surveyed_similar = survey_results_with_original_features[
+            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+            #             filter_property_types
+            #         ))
+            #         ]
+            #     if surveyed_similar.empty:
+            #         surveyed_similar_entire_population = survey_results_with_original_features[
+            #             (
+            #                 survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
+            #                 "Property Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+            #                 property["Wall Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+            #                 property["Roof Type"].split(":")[0]
+            #             ) &
+            #             (
+            #                 survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+            #                 property["Heating"].split(":")[0]
+            #             )
+            #             ]
+            #
+            #         # We order them by distance on postcode
+            #
+            #         # Average
+            #         expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
+            #         expected_epc = sap_to_epc(expected_sap)
+            #
+            #         final_missed_matches.append(
+            #             {
+            #                 "Address ID": a_id,
+            #                 "Confidence Tier": "3 - similar property, all areas searched",
+            #                 "Current EPC Band": expected_epc
+            #             }
+            #
+            #         )
+            #     else:
+            #         expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            #         expected_epc = sap_to_epc(expected_sap)
+            #         if expected_epc in ["C", "B", "A"]:
+            #             tier = "5 - EPC C or above"
+            #         else:
+            #             tier = "3 - similar property, relaxed conditions"
+            #
+            #         final_missed_matches.append(
+            #             {
+            #                 "Address ID": a_id,
+            #                 "Confidence Tier": tier,
+            #                 "Current EPC Band": expected_epc
+            #             }
+            #         )
+            #     continue
+            # # We take an average
+            # expected_sap = surveyed_similar["Current SAP Rating"].mean()
+            # expected_epc = sap_to_epc(expected_sap)
+            # if expected_epc in ["C", "B", "A"]:
+            #     tier = "5 - EPC C or above"
+            # else:
+            #     tier = "3 - similar property"
+            #
+            # final_missed_matches.append(
+            #     {
+            #         "Address ID": a_id,
+            #         "Confidence Tier": tier,
+            #         "Current EPC Band": expected_epc
+            #     }
+            # )
 
         final_missed_matches = pd.DataFrame(final_missed_matches)
 
@@ -1928,27 +2086,33 @@ def propsed_wave_3_sample():
 
     # We create the gain and loss columns
     # Gain is the sum of these columns:
-    # '1 - Archetype surveyed', '1 - property was surveyed',
-    #        '2 - same archetype', '3 - similar property',
+    # '1 - Archetype surveyed',
+    # '1 - property was surveyed',
+    # '2 - same archetype',
+    # '3 - similar property',
+    # '3 - similar property, all areas searched',
+    # '3 - similar property, relaxed conditions'
+    #
     # Loss is the sum of these columns:
     # '4 - no similar property, needs survey to confirm',
     # '5 - EPC C or above', '5 - property was surveyed'
     geographic_summary["Gain"] = geographic_summary[
-        ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property']
+        [
+            '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property',
+            '3 - similar property, all areas searched', '3 - similar property, relaxed conditions'
+        ]
     ].sum(axis=1)
 
     geographic_summary["Loss"] = geographic_summary[
-        ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed']
+        ['5 - EPC C or above', '5 - property was surveyed']
     ].sum(axis=1)
 
-    geographic_summary.sum()
+    print(geographic_summary.sum())
 
     geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
     geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
     geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
 
-    geographic_summary[["Loss", "Gain"]].head()
-
     loss = geographic_summary["Loss"].values
     gain = geographic_summary["Gain"].values
 

From a630fe05c485aca2c5509748eecb5544ddc78dbe Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 19:46:17 +0000
Subject: [PATCH 15/31] fixing unhandled cases in matching algorithm

---
 .../stonewater/Wave 3 Preparation.py          | 92 ++++++++++++++++---
 1 file changed, 78 insertions(+), 14 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 3b44d560..460aa8ee 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1756,20 +1756,44 @@ def propsed_wave_3_sample():
             ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
-            blah1
-            region_surveyed = survey_results[
-                survey_results["Archetype ID"].isin(archetypes) &
-                (survey_results["Postal Region"] == region)
-                ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
-            region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc)
-            region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"])
+            region_surveyed = []
+            for arch_id in archetypes:
+                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                    archetype_data = survey_results_with_original_features[
+                        survey_results["Archetype ID"] == arch_id
+                        ].copy()
+                    if archetype_data.empty:
+                        continue
+                    archetype_data["distance_meters"] = haversine(
+                        lat1=property.latitude, lon1=property.longitude,
+                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                    )
+                    expected_sap = np.average(
+                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                    )
+                    expected_epc = sap_to_epc(expected_sap)
+                    region_surveyed.append(
+                        {
+                            "Archetype ID": arch_id,
+                            "Address ID": property["Address ID"],
+                            "Current EPC Band": expected_epc
+                        }
+                    )
 
-        region_assets = region_assets.merge(
-            region_surveyed,
-            on="Archetype ID",
-            how="left",
-            suffixes=("", "_method1")
-        )
+            region_surveyed = pd.DataFrame(region_surveyed)
+            region_assets = region_assets.merge(
+                region_surveyed,
+                on=["Archetype ID", "Address ID"],
+                how="left",
+                suffixes=("", "_method1")
+            )
+        else:
+            region_assets = region_assets.merge(
+                region_surveyed,
+                on="Archetype ID",
+                how="left",
+                suffixes=("", "_method1")
+            )
 
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = np.where(
@@ -1897,7 +1921,47 @@ def propsed_wave_3_sample():
                 ].copy()
 
             if surveyed.empty:
-                blah3
+                # In this case, we do one additional check where we filter on everything the same apart from heating,
+                # where we do a slightly more rough match
+                surveyed = survey_results_with_original_features[
+                    (
+                        survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+                        property["Property Type"].split(":")[0]
+                    ) &
+                    (
+                        survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                        property["Wall Type"].split(":")[0]
+                    ) &
+                    (
+                        survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                        property["Roof Type"].split(":")[0]
+                    )
+                    ].copy()
+
+                if "Electric" in property["Heating"]:
+                    # Take other electric heating systems
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Electric")]
+                elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)":
+                    # Take other community heating systems
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Community")]
+                elif property["Heating"] == 'Heat Pump: (from database)':
+                    # Take other heat pumps
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")]
+                elif property["Heating"] == "Solid fuel room heaters: Open fire in grate":
+                    # Take other properties with room heaters
+                    surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")]
+                else:
+                    raise Exception("Fix me")
+
+            if surveyed.empty:
+                final_missed_matches.append(
+                    {
+                        "Address ID": a_id,
+                        "Confidence Tier": "4 - no similar property, needs survey to confirm",
+                        "Current EPC Band": "Needs Survey"
+                    }
+                )
+                continue
 
             # Calculate distance
             surveyed["distance_meters"] = haversine(

From 1b38832e27abcbebe575f4be867a41e4ae772949 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 20:13:19 +0000
Subject: [PATCH 16/31] 2044 properties added

---
 .../stonewater/Wave 3 Preparation.py          | 148 ++++++++++++++----
 1 file changed, 117 insertions(+), 31 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 460aa8ee..6f98c9fd 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1938,6 +1938,27 @@ def propsed_wave_3_sample():
                     )
                     ].copy()
 
+                if surveyed.empty:
+                    if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]:
+                        filter_property_types = ["House", "Bungalow", ]
+                    else:
+                        filter_property_types = ["Flat"]
+                    surveyed = survey_results_with_original_features[
+                        (
+                            survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+                                filter_property_types
+                            )
+                        ) &
+                        (
+                            survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                            property["Wall Type"].split(":")[0]
+                        ) &
+                        (
+                            survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                            property["Roof Type"].split(":")[0]
+                        )
+                        ].copy()
+
                 if "Electric" in property["Heating"]:
                     # Take other electric heating systems
                     surveyed = surveyed[surveyed["Heating"].str.contains("Electric")]
@@ -1950,6 +1971,9 @@ def propsed_wave_3_sample():
                 elif property["Heating"] == "Solid fuel room heaters: Open fire in grate":
                     # Take other properties with room heaters
                     surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")]
+                elif "Boiler" in property["Heating"]:
+                    # Take other properties with boilers
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")]
                 else:
                     raise Exception("Fix me")
 
@@ -1972,17 +1996,29 @@ def propsed_wave_3_sample():
 
             # Check if we have a postcode match check if surveyed postcode is the same as the property postcode
             if any(surveyed["Postcode"] == property["Postcode"]):
-                surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]]
+                surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]]
 
             if any(surveyed["Postal Region"] == property["Postal Region"]):
-                surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
+                surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
 
             # Take the 5 nearest
-            surveyed_similar = surveyed_similar.head(5)
+            surveyed = surveyed.head(5)
+
+            # # We allow a max distance of 10km
+            # surveyed = surveyed[surveyed["distance_meters"] < 10000]
+            # if surveyed.empty:
+            #     final_missed_matches.append(
+            #         {
+            #             "Address ID": a_id,
+            #             "Confidence Tier": "4 - no similar property, needs survey to confirm",
+            #             "Current EPC Band": "Needs Survey"
+            #         }
+            #     )
+            #     continue
 
             # perform a weighted mean of SAP rating - the closer the better
             expected_sap = np.average(
-                surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1)
+                surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1)
             )
             expected_epc = sap_to_epc(expected_sap)
 
@@ -2153,23 +2189,21 @@ def propsed_wave_3_sample():
     # '1 - Archetype surveyed',
     # '1 - property was surveyed',
     # '2 - same archetype',
-    # '3 - similar property',
-    # '3 - similar property, all areas searched',
-    # '3 - similar property, relaxed conditions'
+    # '3 - similar property, weighted on distance'
+
+    gain_columns = [
+        '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype',
+        '3 - similar property, weighted on distance'
+    ]
     #
     # Loss is the sum of these columns:
     # '4 - no similar property, needs survey to confirm',
     # '5 - EPC C or above', '5 - property was surveyed'
-    geographic_summary["Gain"] = geographic_summary[
-        [
-            '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property',
-            '3 - similar property, all areas searched', '3 - similar property, relaxed conditions'
-        ]
-    ].sum(axis=1)
 
-    geographic_summary["Loss"] = geographic_summary[
-        ['5 - EPC C or above', '5 - property was surveyed']
-    ].sum(axis=1)
+    loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above',
+                    '5 - property was surveyed']
+    geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1)
+    geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1)
 
     print(geographic_summary.sum())
 
@@ -2180,30 +2214,82 @@ def propsed_wave_3_sample():
     loss = geographic_summary["Loss"].values
     gain = geographic_summary["Gain"].values
 
-    # Define the coefficients for the objective function (negative because we maximize Gain)
-    c = -gain
+    def optimise(gain, loss, max_loss=250):
 
-    # Define constraints
-    A = [loss]  # Only 1 constraint for now, total Loss
-    b = [250]  # Maximum total Loss allowed
+        # Define the coefficients for the objective function (negative because we maximize Gain)
+        c = -gain
 
-    # Bounds for each variable (select or not select each row, 0 <= x <= 1)
-    bounds = [(0, 1) for _ in gain]
+        # Define constraints
+        A = [loss]  # Only 1 constraint for now, total Loss
+        b = [max_loss]  # Maximum total Loss allowed
 
-    # Solve the problem using linprog with HiGHS solver
-    result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs')
-    if not result.success:
-        raise Exception("Optimization failed")
+        # Bounds for each variable (select or not select each row, 0 <= x <= 1)
+        bounds = [(0, 1) for _ in gain]
 
-    selected_rows = result.x.round().astype(int)  # Rounded to 0 or 1
-    optimal_gain = -result.fun
-    print(optimal_gain)
+        # Solve the problem using linprog with HiGHS solver
+        result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs')
+        if not result.success:
+            raise Exception("Optimization failed")
+
+        selected_rows = result.x.round().astype(int)  # Rounded to 0 or 1
+        optimal_gain = -result.fun
+
+        return selected_rows, optimal_gain
+
+    selected_rows, _ = optimise(gain, loss, 250)
 
     # Select the rows that are selected
     geographic_summary["Selected"] = selected_rows == 1
     geographic_summary[geographic_summary["Selected"]].sum()
-    bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum()
+
+    region_totals = geographic_summary[
+        geographic_summary["Selected"]
+    ][["Gain", "Loss"]].sum()
+
+    # We now see if there are any postcodes that have no loss that can be added
+    unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values
+
+    postcode_summary = results.pivot_table(
+        index='Postcode',
+        columns='Confidence Tier',
+        aggfunc='size',
+        fill_value=0
+    ).reset_index()
+    postcode_summary = postcode_summary.merge(
+        results[["Postcode", "Postal Region"]].drop_duplicates(),
+        how="left", on="Postcode"
+    )
+
+    postcode_summary_unselected_regions = postcode_summary[
+        postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions)
+    ].copy()
+
+    postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1)
+    postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1)
+
+    # Remaining loss allowed
+    remaining_loss_constraint = 250 - region_totals["Loss"]
+    postcode_selected_rows, _ = optimise(
+        gain=postcode_summary_unselected_regions["Gain"].values,
+        loss=postcode_summary_unselected_regions["Loss"].values,
+        max_loss=int(remaining_loss_constraint)
+    )
+
+    postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1
+    postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum()
+
+    postcode_optimised_additional_properties = postcode_summary_unselected_regions[
+        postcode_summary_unselected_regions["Selected"]
+    ]
+
+    postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum()
+
+    bid_size = region_totals.sum() + postcode_totals.sum()
     print("Bid Size:", bid_size)
+    total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"]
+    print("Total EPC D or below:", total_epc_d_or_below)
+    total_epc_c = region_totals["Loss"] + postcode_totals["Loss"]
+    print("Total EPC C or above:", total_epc_c)
 
 # if __name__ == "__main__":
 #     main()

From 67f97feb18829a4a2d327335a4a6ed8c8c06e495 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 22:33:42 +0000
Subject: [PATCH 17/31] experimenting with street-level (Street and Region) matching

---
 .../stonewater/Wave 3 Preparation.py          | 105 ++++++++++++------
 1 file changed, 74 insertions(+), 31 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 6f98c9fd..5ebb06e2 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1637,7 +1637,7 @@ def propsed_wave_3_sample():
 
     # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
     # UPRN
-    asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
+    asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])]
     # Clean address ids
     asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@@ -1645,12 +1645,13 @@ def propsed_wave_3_sample():
 
     # Create the postal region, taking the first part of the postcode
     asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
+    asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"]
     unique_postal_regions = asset_list["Postal Region"].unique()
 
     # Keep just the columns we need
     asset_list = asset_list[
-        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
-         "Heating"]
+        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region",
+         "Property Type", "Wall Type", "Roof Type", "Heating"]
     ]
 
     survey_results = pd.read_excel(
@@ -1853,7 +1854,6 @@ def propsed_wave_3_sample():
                 suffixes=("", "_method2")
             )
         else:
-
             region_assets = region_assets.merge(
                 archetype_surveyed,
                 on="Archetype ID",
@@ -1903,20 +1903,20 @@ def propsed_wave_3_sample():
 
             surveyed = survey_results_with_original_features[
                 (
-                    survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
-                    property["Property Type"].split(":")[0]
+                    survey_results_with_original_features["Property Type"] ==
+                    property["Property Type"]
                 ) &
                 (
-                    survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-                    property["Wall Type"].split(":")[0]
+                    survey_results_with_original_features["Wall Type"] ==
+                    property["Wall Type"]
                 ) &
                 (
-                    survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-                    property["Roof Type"].split(":")[0]
+                    survey_results_with_original_features["Roof Type"] ==
+                    property["Roof Type"]
                 ) &
                 (
-                    survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-                    property["Heating"].split(":")[0]
+                    survey_results_with_original_features["Heating"] ==
+                    property["Heating"]
                 )
                 ].copy()
 
@@ -1962,7 +1962,10 @@ def propsed_wave_3_sample():
                 if "Electric" in property["Heating"]:
                     # Take other electric heating systems
                     surveyed = surveyed[surveyed["Heating"].str.contains("Electric")]
-                elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)":
+                elif property["Heating"] in [
+                    "Community Heating Systems: Community boilers only (RdSAP)",
+                    "Community Heating Systems: Community CHP and boilers (RdSAP)"
+                ]:
                     # Take other community heating systems
                     surveyed = surveyed[surveyed["Heating"].str.contains("Community")]
                 elif property["Heating"] == 'Heat Pump: (from database)':
@@ -2001,8 +2004,8 @@ def propsed_wave_3_sample():
             if any(surveyed["Postal Region"] == property["Postal Region"]):
                 surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
 
-            # Take the 5 nearest
-            surveyed = surveyed.head(5)
+            # Take the 3 nearest
+            surveyed = surveyed.head(3)
 
             # # We allow a max distance of 10km
             # surveyed = surveyed[surveyed["distance_meters"] < 10000]
@@ -2176,6 +2179,9 @@ def propsed_wave_3_sample():
 
     results = pd.concat(results)
 
+    # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1)
+    # region = home["Postal Region"].values[0]
+
     # Create a pivot table for counts of Confidence Tier by Postal Region
     geographic_summary = results.pivot_table(
         index='Postal Region',
@@ -2192,7 +2198,9 @@ def propsed_wave_3_sample():
     # '3 - similar property, weighted on distance'
 
     gain_columns = [
-        '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype',
+        '1 - Archetype surveyed',
+        '1 - property was surveyed',
+        '2 - same archetype',
         '3 - similar property, weighted on distance'
     ]
     #
@@ -2200,8 +2208,11 @@ def propsed_wave_3_sample():
     # '4 - no similar property, needs survey to confirm',
     # '5 - EPC C or above', '5 - property was surveyed'
 
-    loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above',
-                    '5 - property was surveyed']
+    loss_columns = [
+        '4 - no similar property, needs survey to confirm',
+        '5 - EPC C or above',
+        '5 - property was surveyed'
+    ]
     geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1)
     geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1)
 
@@ -2249,26 +2260,30 @@ def propsed_wave_3_sample():
     # We now see if there are any postcodes that have no loss that can be added
     unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values
 
+    # TODO: Try on street
+
     postcode_summary = results.pivot_table(
-        index='Postcode',
+        index='Street and Region',
         columns='Confidence Tier',
         aggfunc='size',
         fill_value=0
     ).reset_index()
-    postcode_summary = postcode_summary.merge(
-        results[["Postcode", "Postal Region"]].drop_duplicates(),
-        how="left", on="Postcode"
-    )
-
-    postcode_summary_unselected_regions = postcode_summary[
-        postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions)
-    ].copy()
+    # postcode_summary = postcode_summary.merge(
+    #     results[["Postcode", "Postal Region"]].drop_duplicates(),
+    #     how="left", on="Postcode"
+    # )
+    #
+    postcode_summary_unselected_regions = postcode_summary.copy()
+    # postcode_summary_unselected_regions = postcode_summary[
+    #     postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions)
+    # ].copy()
 
     postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1)
     postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1)
 
     # Remaining loss allowed
-    remaining_loss_constraint = 250 - region_totals["Loss"]
+    # remaining_loss_constraint = 230 - region_totals["Loss"]
+    remaining_loss_constraint = 250
     postcode_selected_rows, _ = optimise(
         gain=postcode_summary_unselected_regions["Gain"].values,
         loss=postcode_summary_unselected_regions["Loss"].values,
@@ -2284,12 +2299,40 @@ def propsed_wave_3_sample():
 
     postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum()
 
-    bid_size = region_totals.sum() + postcode_totals.sum()
+    bid_size = postcode_totals.sum()
     print("Bid Size:", bid_size)
-    total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"]
+    total_epc_d_or_below = postcode_totals["Gain"]
     print("Total EPC D or below:", total_epc_d_or_below)
-    total_epc_c = region_totals["Loss"] + postcode_totals["Loss"]
+    total_epc_c = postcode_totals["Loss"]
     print("Total EPC C or above:", total_epc_c)
+    # Total needing a survey
+    total_needing_survey = postcode_optimised_additional_properties[
+        "4 - no similar property, needs survey to confirm"
+    ].sum()
+    print("Total needing survey:", total_needing_survey)
+
+    # Look for postcodes that have no loss
+    unselected_streets = postcode_summary_unselected_regions[
+        ~postcode_summary_unselected_regions["Selected"]
+    ]["Street and Region"].values
+
+    postcode_summary2 = results[
+        results["Street and Region"].isin(unselected_streets)
+    ].pivot_table(
+        index='Postcode',
+        columns='Confidence Tier',
+        aggfunc='size',
+        fill_value=0
+    ).reset_index()
+
+    postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1)
+    postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1)
+
+    no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False)
+    total_bid_size = bid_size + no_loss_postcodes["Gain"].sum()
+    print(total_bid_size)
+
+    z = results[results["Confidence Tier"] == "5 - EPC C or above"]
 
 # if __name__ == "__main__":
 #     main()

From efba61c6ac52740d70c51864ea49c0d5623b353d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 22:51:24 +0000
Subject: [PATCH 18/31] tweaking: extract match_property_to_surveyed, derive gain/loss columns from tiers

---
 .../stonewater/Wave 3 Preparation.py          | 121 ++++++++++++------
 1 file changed, 83 insertions(+), 38 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 5ebb06e2..974cd908 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1719,6 +1719,72 @@ def propsed_wave_3_sample():
     # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
     #
 
+    def match_property_to_surveyed(property, survey_results_with_original_features):
+        surveyed = survey_results_with_original_features[
+            (
+                survey_results_with_original_features["Property Type"] ==
+                property["Property Type"]
+            ) &
+            (
+                survey_results_with_original_features["Wall Type"] ==
+                property["Wall Type"]
+            ) &
+            (
+                survey_results_with_original_features["Roof Type"] ==
+                property["Roof Type"]
+            ) &
+            (
+                survey_results_with_original_features["Heating"] ==
+                property["Heating"]
+            )
+            ].copy()
+
+        if not surveyed.empty:
+            return surveyed
+
+        surveyed = survey_results_with_original_features[
+            (
+                survey_results_with_original_features["Property Type"] ==
+                property["Property Type"]
+            ) &
+            (
+                survey_results_with_original_features["Wall Type"] ==
+                property["Wall Type"]
+            ) &
+            (
+                survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                property["Roof Type"].split(":")[0]
+            ) &
+            (
+                survey_results_with_original_features["Heating"] ==
+                property["Heating"]
+            )
+            ].copy()
+
+        if not surveyed.empty:
+            return surveyed
+
+        surveyed = survey_results_with_original_features[
+            (
+                survey_results_with_original_features["Property Type"] ==
+                property["Property Type"]
+            ) &
+            (
+                survey_results_with_original_features["Wall Type"] ==
+                property["Wall Type"]
+            ) &
+            (
+                survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                property["Roof Type"].split(":")[0]
+            ) &
+            (
+                survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+                property["Heating"].split(":")[0]
+            )
+            ].copy()
+
+        return surveyed
+
     results = []
     for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
@@ -1757,6 +1823,7 @@ def propsed_wave_3_sample():
             ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
         if region_surveyed["Archetype ID"].duplicated().sum():
+
             region_surveyed = []
             for arch_id in archetypes:
                 for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
@@ -1765,6 +1832,12 @@ def propsed_wave_3_sample():
                         ].copy()
                     if archetype_data.empty:
                         continue
+                    if archetype_data.shape[0] > 1:
+                        # Look for an exact match, or as close as possible
+                        archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
+                        if not archetype_data_filtered.empty:
+                            archetype_data = archetype_data_filtered
+
                     archetype_data["distance_meters"] = haversine(
                         lat1=property.latitude, lon1=property.longitude,
                         lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
@@ -1899,28 +1972,15 @@ def propsed_wave_3_sample():
             # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
         final_missed_matches = []
         for a_id in missed_addressids:
+
+            match_type = "3 - compared to similar properties"
+
             property = asset_list[asset_list["Address ID"] == a_id].squeeze()
 
-            surveyed = survey_results_with_original_features[
-                (
-                    survey_results_with_original_features["Property Type"] ==
-                    property["Property Type"]
-                ) &
-                (
-                    survey_results_with_original_features["Wall Type"] ==
-                    property["Wall Type"]
-                ) &
-                (
-                    survey_results_with_original_features["Roof Type"] ==
-                    property["Roof Type"]
-                ) &
-                (
-                    survey_results_with_original_features["Heating"] ==
-                    property["Heating"]
-                )
-                ].copy()
+            surveyed = match_property_to_surveyed(property, survey_results_with_original_features)
 
             if surveyed.empty:
+                match_type = "3 - compared to similar properties, relaxed"
                 # In this case, we do one additional check where we filter on everything the same apart from heating,
                 # where we do a slightly more rough match
                 surveyed = survey_results_with_original_features[
@@ -2026,14 +2086,12 @@ def propsed_wave_3_sample():
             expected_epc = sap_to_epc(expected_sap)
 
             if expected_epc in ["C", "B", "A"]:
-                tier = "5 - EPC C or above"
-            else:
-                tier = "3 - similar property, weighted on distance"
+                match_type = "5 - EPC C or above"
 
             final_missed_matches.append(
                 {
                     "Address ID": a_id,
-                    "Confidence Tier": tier,
+                    "Confidence Tier": match_type,
                     "Current EPC Band": expected_epc
                 }
             )
@@ -2197,22 +2255,9 @@ def propsed_wave_3_sample():
     # '2 - same archetype',
     # '3 - similar property, weighted on distance'
 
-    gain_columns = [
-        '1 - Archetype surveyed',
-        '1 - property was surveyed',
-        '2 - same archetype',
-        '3 - similar property, weighted on distance'
-    ]
-    #
-    # Loss is the sum of these columns:
-    # '4 - no similar property, needs survey to confirm',
-    # '5 - EPC C or above', '5 - property was surveyed'
+    gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x])
+    loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x])
 
-    loss_columns = [
-        '4 - no similar property, needs survey to confirm',
-        '5 - EPC C or above',
-        '5 - property was surveyed'
-    ]
     geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1)
     geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1)
 
@@ -2283,7 +2328,7 @@ def propsed_wave_3_sample():
 
     # Remaining loss allowed
     # remaining_loss_constraint = 230 - region_totals["Loss"]
-    remaining_loss_constraint = 250
+    remaining_loss_constraint = 220
     postcode_selected_rows, _ = optimise(
         gain=postcode_summary_unselected_regions["Gain"].values,
         loss=postcode_summary_unselected_regions["Loss"].values,

From 294506853dd32fb9aa21ce6500d6eebed7e41be6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 18:24:26 +0000
Subject: [PATCH 19/31] adding in new features

---
 etl/customers/aiha/bid_numbers.py             | 18 +++++-
 etl/customers/remote_assessments/app.py       |  1 +
 .../stonewater/Wave 3 Preparation.py          | 59 +++++++++++++++++--
 3 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/etl/customers/aiha/bid_numbers.py b/etl/customers/aiha/bid_numbers.py
index 96859f99..b371e2e5 100644
--- a/etl/customers/aiha/bid_numbers.py
+++ b/etl/customers/aiha/bid_numbers.py
@@ -52,6 +52,20 @@ aiha_wave_3_features = aiha_original_asset_data[
 wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts()
 property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index()
 
+aiha_wave_3_features[aiha_wave_3_features["Property type"] == "Flat"][["Street address", "Postcode"]]
+
+# 4   Yetev Lev Court   ...  Semi-Detached     mid  - Medium
+# B    86 Bethune Road  ...    Mid-Terrace     top. - Low
+# A    80 Bethune Road  ...    Mid-Terrace  ground. - Low
+# B    80 Bethune Road  ...             \n      \n  - Low
+# A   9 Clapton Common  ...  Semi-Detached  ground. - Low
+# C   9 Clapton Common  ...    End-Terrace      \n. - Low
+# B      89 Manor Road  ...             \n      \n. - Low
+# A  6 Northfield Road  ...       Detached     top. - Low
+# 13 Northfield Rd  ...  Semi-Detached      \n      - Low
+# A      73 Manor Road  ...    End-Terrace      \n  - Low
+# B      73 Manor Road  ...       Detached     top  - Low
+
 # Hornsey data - contained in original asset list
 hornsey_asset_list = pd.read_excel(
     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
@@ -88,5 +102,5 @@ caha_epc_data = pd.read_excel(
     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_extracted_property_data.xlsx"
 )
 
-caha_epc_data["property_type"].value_counts()
-caha_epc_data["wall_type"].value_counts()
+caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["property_type"].value_counts()
+caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["wall_type"].value_counts()
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index 33015d87..59e0e868 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -17,6 +17,7 @@ def app():
             "address": "5, Lynton Street",
             "postcode": "DE22 3RW"
         }
+
     ]
     asset_list = pd.DataFrame(asset_list)
 
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 974cd908..81b5915f 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -6,6 +6,7 @@ import numpy as np
 from tqdm import tqdm
 from collections import Counter
 from scipy.optimize import linprog
+from utils.s3 import read_pickle_from_s3
 
 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
 SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
@@ -1264,7 +1265,7 @@ def main():
         stonewater_data[c] = stonewater_data[c].astype(str)
 
     # Save this data to excel
-    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V2.xlsx", index=False)
+    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False)
 
     cost_sheet = [
         {
@@ -1654,17 +1655,66 @@ def propsed_wave_3_sample():
          "Property Type", "Wall Type", "Roof Type", "Heating"]
     ]
 
+    # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False)
     survey_results = pd.read_excel(
         os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"),
         header=13,
         sheet_name="Modelled Packages"
     )
 
+    additional_survey_data = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"),
+        header=0
+    )
+    survey_results = survey_results.merge(
+        additional_survey_data[
+            [
+                "Address ID",
+                "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness",
+                "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
+                "Main Building Alternative Wall Thickness"
+            ]
+        ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}),
+        how="left",
+        on="Address ID"
+    )
+
     # TOOD: We probably want the actual surveyed wall, roof, heating type
     survey_results = survey_results[
-        ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"]
-    ]
-    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
+        [
+            "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode",
+            "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness",
+            "Existing Primary Heating System",
+            "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness",
+            "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
+            "Main Building Alternative Wall Thickness"
+        ]
+    ].rename(
+        columns={
+            "Existing Primary Heating System": "Surveyed Primary Heating System"
+        }
+    )
+
+    # Concatenate from the wall information
+    survey_results["Surveyed: Wall Type"] = survey_results["Main Wall Type"] + ": " + survey_results[
+        "Main Wall Insulation Type"]
+    # Alternative wall
+    survey_results["Survey: Main Alternative Wall"] = (
+        survey_results["Main Building Alternative Wall Type"] + ": " + survey_results[
+        "Main Building Alternative Wall Insulation"]
+    )
+    # Roof information
+    survey_results["Survey: Type"] = survey_results["Main Roof Type"] + ": " + survey_results[
+        "Main Roof Insulation"] + ": " + survey_results["Main Roof Insulation Thickness"].astype(str)
+
+    # Drop the individual columns:
+    survey_results = survey_results.drop(
+        columns=[
+            "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness",
+            "Main Wall Type", "Main Wall Insulation Type",
+            "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation"
+        ]
+    )
 
     survey_results_with_original_features = survey_results.merge(
         asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
@@ -1676,7 +1726,6 @@ def propsed_wave_3_sample():
         raise ValueError("Something went wrong")
 
     # We get longitude & Latitude
-    from utils.s3 import read_pickle_from_s3
     archetyping_spatial_features = read_pickle_from_s3(
         bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
     )

From 377d9929e418073567b6af8f589eb5fe58e92a1e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 19:21:35 +0000
Subject: [PATCH 20/31] cleaning roof extraction

---
 .../stonewater/Wave 3 Preparation.py          | 100 +++++++++++++-----
 1 file changed, 72 insertions(+), 28 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 81b5915f..aa9e4488 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -291,26 +291,11 @@ def extract_summary_report(pdf_path):
         data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
         data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
 
-        roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL)
-        roof_text = roof_section.group(1).strip()
-        roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text)
-        data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None
-
-        # Check if "Insulation" exists between Type and Insulation Thickness
-        insulation_search = re.search(
-            r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL
-        )
-
-        if insulation_search:
-            # Insulation match will be present if it exists, otherwise it will be None
-            insulation_match = insulation_search.group(2)  # Optional group for Insulation
-            insulation_thickness_match = insulation_search.group(4)  # Required group for Insulation Thickness
-
-            # Populate insulation fields
-            data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None
-            data["Main Roof Insulation Thickness"] = (
-                insulation_thickness_match.strip() if insulation_thickness_match else None
-            )
+        extracted_roof_data = extract_roof_details_summary(text)
+        main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0]
+        data["Main Roof Type"] = main_roof_data["Roof Type"]
+        data["Main Roof Insulation"] = main_roof_data["Roof Insulation"]
+        data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"]
 
         walls_data = extract_wall_details_summary(text)
         # Get the main building wall data
@@ -593,6 +578,54 @@ def extract_roof_details_epr(text):
     return roof_data
 
 
+def extract_roof_details_summary(text):
+    """
+    Extracts roof type, insulation, and insulation thickness for each building part
+    in the 8.0 Roofs section of the summary report.
+    """
+    # Define data structure to hold results
+    roof_data = []
+
+    # Locate the entire 8.0 Roofs section
+    roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL)
+    if not roof_section_match:
+        return roof_data  # Return empty if no roof section is found
+
+    # Extract the roof section and append "9.0 Floors:" as the boundary
+    roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:"
+
+    # Define pattern to match each building part's roof entry
+    building_part_pattern = re.compile(
+        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Matches each building part label
+        r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))"  # Matches Roof Type until the next field, label, or end
+        r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?"  # Optional Insulation
+        r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?",  # Optional Insulation Thickness
+        re.DOTALL
+    )
+
+    # Extract each building part's data
+    for match in building_part_pattern.finditer(roof_section):
+        part_name = match.group(1).strip()  # Building part label
+        roof_type = match.group(2).strip()  # Roof Type
+        roof_insulation = match.group(3).strip() if match.group(3) else None  # Optional Insulation
+        roof_insulation_thickness = match.group(4).strip() if match.group(4) else None  # Optional Thickness
+
+        # Cleaning to handle annoying cases when it comes out like this:
+        # 'A Another dwelling above\n1st Extension'
+        if roof_type.startswith("A Another dwelling above"):
+            roof_type = "A Another dwelling above"
+
+        # Store results for this building part
+        roof_data.append({
+            "Building Part": part_name,
+            "Roof Type": roof_type,
+            "Roof Insulation": roof_insulation,
+            "Roof Insulation Thickness": roof_insulation_thickness,
+        })
+
+    return roof_data
+
+
 def extract_wall_details_epr(text):
     """
     Extracts wall type, insulation, dry-lining, and thickness for each building part
@@ -1691,21 +1724,21 @@ def propsed_wave_3_sample():
         ]
     ].rename(
         columns={
-            "Existing Primary Heating System": "Surveyed Primary Heating System"
+            "Existing Primary Heating System": "Survey: Primary Heating System"
         }
     )
 
     # Concatenate from the wall information
-    survey_results["Surveyed: Wall Type"] = survey_results["Main Wall Type"] + ": " + survey_results[
-        "Main Wall Insulation Type"]
+    survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
+        "Main Wall Insulation Type"].astype(str)
     # Alternative wall
     survey_results["Survey: Main Alternative Wall"] = (
-        survey_results["Main Building Alternative Wall Type"] + ": " + survey_results[
-        "Main Building Alternative Wall Insulation"]
+        survey_results["Main Building Alternative Wall Type"].astype(str) + ": " + survey_results[
+        "Main Building Alternative Wall Insulation"].astype(str)
     )
     # Roof information
-    survey_results["Survey: Type"] = survey_results["Main Roof Type"] + ": " + survey_results[
-        "Main Roof Insulation"] + ": " + survey_results["Main Roof Insulation Thickness"].astype(str)
+    survey_results["Survey: Main Roof Type"] = survey_results["Main Roof Type"].astype(str) + ": " + survey_results[
+        "Main Roof Insulation"].astype(str) + ": " + survey_results["Main Roof Insulation Thickness"].astype(str)
 
     # Drop the individual columns:
     survey_results = survey_results.drop(
@@ -1834,6 +1867,11 @@ def propsed_wave_3_sample():
 
         return surveyed
 
+    survey_attribute_columns = [
+        "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
+        'Survey: Primary Heating System'
+    ]
+
     results = []
     for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
@@ -1845,7 +1883,8 @@ def propsed_wave_3_sample():
         ]
 
         region_assets = region_assets.merge(
-            exact_surveyed[["Address ID", "Current EPC Band"]],
+            exact_surveyed[
+                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns],
             on="Address ID",
             how="left"
         )
@@ -2286,6 +2325,11 @@ def propsed_wave_3_sample():
 
     results = pd.concat(results)
 
+    # Check if there are missings in current epc band, current sap rating or any of the survey attributes
+    for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns:
+        if pd.isnull(results[c]).sum():
+            raise Exception("Something went wrong")
+
     # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1)
     # region = home["Postal Region"].values[0]
 

From a7857c0375949f5d45d47afe41f59e07de883e71 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 20:30:57 +0000
Subject: [PATCH 21/31] pulling out data from best match

---
 .../stonewater/Wave 3 Preparation.py          | 111 ++++++++++--------
 etl/find_my_epc/RetrieveFindMyEpc.py          |   1 +
 etl/route_march_data_pull/app.py              |  65 ++++------
 3 files changed, 83 insertions(+), 94 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index aa9e4488..08236d5b 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1727,7 +1727,7 @@ def propsed_wave_3_sample():
             "Existing Primary Heating System": "Survey: Primary Heating System"
         }
     )
-
+    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
     # Concatenate from the wall information
     survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
         "Main Wall Insulation Type"].astype(str)
@@ -1872,6 +1872,8 @@ def propsed_wave_3_sample():
         'Survey: Primary Heating System'
     ]
 
+    survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy()
+
     results = []
     for region in tqdm(unique_postal_regions):
         # Take all of the properties in that region
@@ -1884,10 +1886,14 @@ def propsed_wave_3_sample():
 
         region_assets = region_assets.merge(
             exact_surveyed[
-                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns],
+                ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
+                    "Survey: Matching Address ID"
+                ]
+                ],
             on="Address ID",
             how="left"
         )
+        region_assets['Distance to Closest Match (m)'] = 0
 
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = None
@@ -1901,61 +1907,62 @@ def propsed_wave_3_sample():
             "5 - property was surveyed", region_assets["Confidence Tier"]
         )
 
-        archetypes = region_assets[
+        archetype_ids = region_assets[
             pd.isnull(region_assets["Confidence Tier"])
         ]["Archetype ID"].unique()
         # We get the properties that have been surveyed
-        region_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(archetypes) &
-            (survey_results["Postal Region"] == region)
-            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
 
-        if region_surveyed["Archetype ID"].duplicated().sum():
+        region_surveyed = []
+        for arch_id in archetype_ids:
+            for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                archetype_data = survey_results_with_original_features[
+                    survey_results["Archetype ID"] == arch_id
+                    ].copy()
+                if archetype_data.empty:
+                    continue
+                if archetype_data.shape[0] > 1:
+                    # Look for an exact match, or as close as possible
+                    archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
+                    if not archetype_data_filtered.empty:
+                        archetype_data = archetype_data_filtered
 
-            region_surveyed = []
-            for arch_id in archetypes:
-                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
-                    archetype_data = survey_results_with_original_features[
-                        survey_results["Archetype ID"] == arch_id
-                        ].copy()
-                    if archetype_data.empty:
-                        continue
-                    if archetype_data.shape[0] > 1:
-                        # Look for an exact match, or as close as possible
-                        archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
-                        if not archetype_data_filtered.empty:
-                            archetype_data = archetype_data_filtered
+                archetype_data["distance_meters"] = haversine(
+                    lat1=property.latitude, lon1=property.longitude,
+                    lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                )
+                expected_sap = np.average(
+                    archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                )
+                expected_epc = sap_to_epc(expected_sap)
 
-                    archetype_data["distance_meters"] = haversine(
-                        lat1=property.latitude, lon1=property.longitude,
-                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
-                    )
-                    expected_sap = np.average(
-                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
-                    )
-                    expected_epc = sap_to_epc(expected_sap)
-                    region_surveyed.append(
-                        {
-                            "Archetype ID": arch_id,
-                            "Address ID": property["Address ID"],
-                            "Current EPC Band": expected_epc
-                        }
-                    )
+                # We take the features of the closest matching property
+                closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0]
 
-            region_surveyed = pd.DataFrame(region_surveyed)
-            region_assets = region_assets.merge(
-                region_surveyed,
-                on=["Archetype ID", "Address ID"],
-                how="left",
-                suffixes=("", "_method1")
-            )
-        else:
-            region_assets = region_assets.merge(
-                region_surveyed,
-                on="Archetype ID",
-                how="left",
-                suffixes=("", "_method1")
-            )
+                region_surveyed.append(
+                    {
+                        "Archetype ID": arch_id,
+                        "Address ID": property["Address ID"],
+                        "Current EPC Band": expected_epc,
+                        "Current SAP Rating": expected_sap,
+                        'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
+                        'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"],
+                        'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
+                        'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
+                        "Survey: Matching Address ID": closest_match["Address ID"],
+                        'Distance to Closest Match (m)': closest_match["distance_meters"]
+                    }
+                )
+
+        region_surveyed = pd.DataFrame(region_surveyed)
+        starting_shape = region_assets.shape[0]
+        region_assets = region_assets.merge(
+            region_surveyed,
+            on=["Archetype ID", "Address ID"],
+            how="left",
+            suffixes=("", "_method1")
+        )
+        if region_assets.shape[0] != starting_shape:
+            raise ValueError("Something went wrong")
 
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = np.where(
@@ -2326,7 +2333,9 @@ def propsed_wave_3_sample():
     results = pd.concat(results)
 
     # Check if there are missings in current epc band, current sap rating or any of the survey attributes
-    for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns:
+    for c in (
+        ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
+        survey_attribute_columns):
         if pd.isnull(results[c]).sum():
             raise Exception("Something went wrong")
 
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index 913a04b8..d5a5134f 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -269,6 +269,7 @@ class RetrieveFindMyEpc:
             "Loft insulation": ["loft_insulation"],
             "Solar photovoltaic (PV) panels": ["solar_pv"],
             "Party wall insulation": ["party_wall_insulation"],
+            'Draught proofing': ["draught_proofing"],
         }
 
         survey = True
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index f24c5bb2..1e478b0c 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
     epc_data = []
     errors = []
+    no_epc = []
     for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
-        postcode = home[postcode_column]
-        house_number = home[address1_column]
-        full_address = home[fulladdress_column]
-
-        searcher = SearchEpc(
-            address1=str(house_number),
-            postcode=postcode,
-            auth_token=EPC_AUTH_TOKEN,
-            os_api_key="",
-            property_type=None,
-            fast=True,
-            full_address=full_address,
-            max_retries=5
-        )
-        # Force the skipping of estimating the EPC
-        searcher.ordnance_survey_client.property_type = None
-        searcher.ordnance_survey_client.built_form = None
-
-        searcher.find_property(skip_os=True)
-        if searcher.newest_epc is None:
-            continue
-
-        # Look for EPC recommendatons
-        try:
-            property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
-        except:
-            property_recommendations = {"rows": []}
-
-        # Retrieve data from FindMyEPC
-        find_epc_searcher = RetrieveFindMyEpc(
-            address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
-        )
-        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
-        time.sleep(np.random.uniform(0.1, 1))
         try:
             postcode = home[postcode_column]
             house_number = home[address1_column]
@@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
 
             searcher.find_property(skip_os=True)
             if searcher.newest_epc is None:
+                no_epc.append(home["row_id"])
                 continue
 
             # Look for EPC recommendatons
@@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
             errors.append(home["row_id"])
             time.sleep(5)
 
-    return epc_data, errors
+    return epc_data, errors, no_epc
 
 
 def extract_address1(asset_list, full_address_col, method="first_two_words"):
@@ -140,26 +108,37 @@ def app():
     Property UPRN
 
     """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
-    DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
+    DATA_FILENAME = "Bromford programme review.xlsx"
+    SHEET_NAME = "Bromford"
     POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = "Address"
-    ADDRESS1_COLUMN = None
+    FULLADDRESS_COLUMN = None
+    ADDRESS1_COLUMN = "No."
     ADDRESS1_METHOD = "first_two_words"
+    ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
 
-    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
+    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
+    asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
     asset_list["row_id"] = asset_list.index
 
     # We clean up portential non-breaking spaces, and double spaces
     for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
+        asset_list[col] = asset_list[col].astype(str)
         asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
         asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)
 
     if ADDRESS1_COLUMN is None:
         ADDRESS1_COLUMN = "address1_extracted"
-        asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)
+        asset_list = extract_address1(
+            asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
+        )
 
-    epc_data, errors = get_data(
+    if FULLADDRESS_COLUMN is None:
+        FULLADDRESS_COLUMN = "fulladdress_extracted"
+        # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
+        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
+
+    epc_data, errors, no_epc = get_data(
         asset_list=asset_list,
         fulladdress_column=FULLADDRESS_COLUMN,
         address1_column=ADDRESS1_COLUMN,
@@ -168,7 +147,7 @@ def app():
 
     # We now retrieve any failed properties
     asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
-    epc_data_failed, _ = get_data(
+    epc_data_failed, _, _ = get_data(
         asset_list=asset_list_failed,
         fulladdress_column=FULLADDRESS_COLUMN,
         address1_column=ADDRESS1_COLUMN,

From 7accbded137918ba4e38c5b6ed79703b0e727e3d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 21:38:00 +0000
Subject: [PATCH 22/31] debugging find epc pull

---
 etl/find_my_epc/RetrieveFindMyEpc.py | 21 ++++++++++++++++++++-
 etl/route_march_data_pull/app.py     | 22 ++++++++++++++++++----
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index d5a5134f..ac0e8235 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -126,6 +126,7 @@ class RetrieveFindMyEpc:
             # Find all h3 headers for each step and extract their related information
             step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
             previous_sap_score = current_sap
+            previous_epc = current_rating.split(' ')[-6]
             for step_num, step_header in enumerate(step_headers, start=1):
                 # Extract the step title (the measure)
                 measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")
@@ -138,7 +139,11 @@ class RetrieveFindMyEpc:
                 # Check if the potential rating div is found
                 if potential_rating_div:
                     # Extract the rating text within the SVG text element
-                    rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold').text.strip()
+                    extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold')
+                    if extracted_rating_text is not None:
+                        rating_text = extracted_rating_text.text.strip()
+                    else:
+                        rating_text = " ".join([str(previous_sap_score), previous_epc])
                     # Parse the rating text to separate the numeric rating and EPC letter
                     new_rating = int(rating_text.split()[0])
                     new_epc = rating_text.split()[1]
@@ -152,6 +157,7 @@ class RetrieveFindMyEpc:
                         "sap_points": new_rating - previous_sap_score
                     })
                     previous_sap_score = new_rating
+                    previous_epc = new_epc
 
         # Search for the assessment informaton
         assessment_information = address_res.find('div', {'id': 'information'})
@@ -270,6 +276,19 @@ class RetrieveFindMyEpc:
             "Solar photovoltaic (PV) panels": ["solar_pv"],
             "Party wall insulation": ["party_wall_insulation"],
             'Draught proofing': ["draught_proofing"],
+            "Roof insulation recommendation": [],
+            "Cavity wall insulation recommendation": [],
+            "Windows draught proofing": [],
+            "Low energy lighting for all fixed outlets": ["low_energy_lighting"],
+            "Cylinder thermostat recommendation": [],
+            "Heating controls recommendation": [],
+            "Replace boiler with Band A condensing boiler": [],
+            "Solar panel recommendation": [],
+            "Double glazing recommendation": [],
+            "Solid wall insulation recommendation": [],
+            "Fuel change recommendation": [],
+            "PV Cells recommendation": [],
+            "Replacement glazing units": ["double_glazing"],
         }
 
         survey = True
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 1e478b0c..80caefc9 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -1,5 +1,6 @@
 import os
 import time
+from idlelib.iomenu import errors
 
 import pandas as pd
 import numpy as np
@@ -21,6 +22,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
 def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
+    home = asset_list[asset_list["row_id"].isin(errors)].head(1).tail(1).squeeze()
+
     epc_data = []
     errors = []
     no_epc = []
@@ -56,10 +59,21 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
                 property_recommendations = {"rows": []}
 
             # Retrieve data from FindMyEPC
-            find_epc_searcher = RetrieveFindMyEpc(
-                address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
-            )
-            find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+            try:
+                find_epc_searcher = RetrieveFindMyEpc(
+                    address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
+                )
+                find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+            except ValueError as e:
+                if "No EPC found" in str(e):
+                    find_epc_searcher = RetrieveFindMyEpc(
+                        address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
+                    )
+                    find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+                else:
+                    find_epc_data = {}
+            except Exception as e:
+                raise Exception(f"Error retrieving FindMyEPC data: {e}")
             time.sleep(np.random.uniform(0.1, 1))
 
             epc = {

From 6eb52a509ebb8a110ca09533e4cba85b66edacf2 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 21:38:40 +0000
Subject: [PATCH 23/31] removing error line

---
 etl/route_march_data_pull/app.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 80caefc9..d9f6bf43 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -22,8 +22,6 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 
 
 def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
-    home = asset_list[asset_list["row_id"].isin(errors)].head(1).tail(1).squeeze()
-
     epc_data = []
     errors = []
     no_epc = []

From ac9b7b37300204c83f862871ebd511208625978b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 22:08:10 +0000
Subject: [PATCH 24/31] updating methodology for matching

---
 .../stonewater/Wave 3 Preparation.py          | 193 +++++++++++-------
 1 file changed, 114 insertions(+), 79 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 08236d5b..f74dc19d 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1867,6 +1867,19 @@ def propsed_wave_3_sample():
 
         return surveyed
 
+    def fill_survey_columns(region_assets, suffix):
+        for col in [
+            'Current EPC Band', 'Current SAP Rating',
+            'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
+            'Survey: Main Roof Type', 'Survey: Primary Heating System',
+            'Survey: Matching Address ID', 'Distance to Closest Match (m)'
+        ]:
+            region_assets[col] = np.where(
+                pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]),
+                region_assets[col + suffix], region_assets[col]
+            )
+        return region_assets
+
     survey_attribute_columns = [
         "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
         'Survey: Primary Heating System'
@@ -1920,6 +1933,14 @@ def propsed_wave_3_sample():
                     ].copy()
                 if archetype_data.empty:
                     continue
+
+                match_type = "2 - same archetype"
+                if any(archetype_data["Postal Region"] == property["Postal Region"]):
+                    match_type = "1 - same archetype, same postal region"
+                    archetype_data = archetype_data[
+                        archetype_data["Postal Region"] == property["Postal Region"]
+                        ]
+
                 if archetype_data.shape[0] > 1:
                     # Look for an exact match, or as close as possible
                     archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
@@ -1949,11 +1970,21 @@ def propsed_wave_3_sample():
                         'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
                         'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
                         "Survey: Matching Address ID": closest_match["Address ID"],
-                        'Distance to Closest Match (m)': closest_match["distance_meters"]
+                        'Distance to Closest Match (m)': closest_match["distance_meters"],
+                        "Match Type": match_type
                     }
                 )
-
         region_surveyed = pd.DataFrame(region_surveyed)
+
+        if region_surveyed.empty:
+            region_surveyed = pd.DataFrame(
+                columns=[
+                    "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
+                    'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
+                    'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
+                ]
+            )
+
         starting_shape = region_assets.shape[0]
         region_assets = region_assets.merge(
             region_surveyed,
@@ -1968,95 +1999,99 @@ def propsed_wave_3_sample():
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
             pd.isnull(region_assets["Confidence Tier"]),
-            "1 - Archetype surveyed", region_assets["Confidence Tier"]
+            "1 - Archetype surveyed in region", region_assets["Confidence Tier"]
         )
 
-        region_assets["Current EPC Band"] = np.where(
-            pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]),
-            region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"]
-        )
         # Handle EPC C
         region_assets["Confidence Tier"] = np.where(
-            region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
+            region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) &
+            pd.isnull(region_assets["Confidence Tier"]),
             "5 - EPC C or above", region_assets["Confidence Tier"]
         )
 
-        region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
-        # TODO: Turn into a function
-        missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
+        region_assets = fill_survey_columns(region_assets, suffix="_method1")
 
-        archetype_surveyed = survey_results[
-            survey_results["Archetype ID"].isin(missed_archetypes)
-        ][["Archetype ID", "Current EPC Band"]].drop_duplicates()
+        method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")]
+        region_assets = region_assets.drop(columns=method_1_columns)
 
-        if archetype_surveyed["Archetype ID"].duplicated().sum():
+        missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
 
-            archetype_surveyed = []
-            for arch_id in missed_archetypes:
-                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
-                    archetype_data = survey_results_with_original_features[
-                        survey_results["Archetype ID"] == arch_id
-                        ].copy()
-                    if archetype_data.empty:
-                        continue
-                    archetype_data["distance_meters"] = haversine(
-                        lat1=property.latitude, lon1=property.longitude,
-                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
-                    )
-                    expected_sap = np.average(
-                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
-                    )
-                    expected_epc = sap_to_epc(expected_sap)
-                    archetype_surveyed.append(
-                        {
-                            "Archetype ID": arch_id,
-                            "Address ID": property["Address ID"],
-                            "Current EPC Band": expected_epc
-                        }
-                    )
-            archetype_surveyed = pd.DataFrame(archetype_surveyed)
-            region_assets = region_assets.merge(
-                archetype_surveyed,
-                on=["Archetype ID", "Address ID"],
-                how="left",
-                suffixes=("", "_method2")
-            )
-        else:
-            region_assets = region_assets.merge(
-                archetype_surveyed,
-                on="Archetype ID",
-                how="left",
-                suffixes=("", "_method2")
-            )
-
-        region_assets["Confidence Tier"] = np.where(
-            region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
-                region_assets["Confidence Tier"]),
-            "2 - same archetype", region_assets["Confidence Tier"]
-        )
-
-        region_assets["Current EPC Band"] = np.where(
-            pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]),
-            region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"]
-        )
-
-        region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
+        archetype_surveyed = []
+        for arch_id in missed_archetypes:
+            for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                archetype_data = survey_results_with_original_features[
+                    survey_results["Archetype ID"] == arch_id
+                    ].copy()
+                if archetype_data.empty:
+                    continue
+                raise Exception("IMPLEMENT ME")
+        #         archetype_data["distance_meters"] = haversine(
+        #             lat1=property.latitude, lon1=property.longitude,
+        #             lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+        #         )
+        #         expected_sap = np.average(
+        #             archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+        #         )
+        #         expected_epc = sap_to_epc(expected_sap)
+        #         archetype_surveyed.append(
+        #             {
+        #                 "Archetype ID": arch_id,
+        #                 "Address ID": property["Address ID"],
+        #                 "Current EPC Band": expected_epc
+        #             }
+        #         )
+        # archetype_surveyed = pd.DataFrame(archetype_surveyed)
+        # if archetype_surveyed.empty:
+        #     archetype_surveyed = pd.DataFrame(
+        #         columns=[
+        #             "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
+        #             'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
+        #             'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
+        #         ]
+        #     )
+        #
+        # region_assets = region_assets.merge(
+        #     archetype_surveyed,
+        #     on=["Archetype ID", "Address ID"],
+        #     how="left",
+        #     suffixes=("", "_method2")
+        # )
+        #
+        # region_assets["Confidence Tier"] = np.where(
+        #     region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
+        #         region_assets["Confidence Tier"]),
+        #     "2 - same archetype", region_assets["Confidence Tier"]
+        # )
+        #
+        # for col in [
+        #     'Current EPC Band', 'Current SAP Rating',
+        #     'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
+        #     'Survey: Main Roof Type', 'Survey: Primary Heating System',
+        #     'Survey: Matching Address ID', 'Distance to Closest Match (m)'
+        # ]:
+        #     region_assets[col] = np.where(
+        #         pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]),
+        #         region_assets[col + "_method2"], region_assets[col]
+        #     )
+        #
+        # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")]
+        # region_assets = region_assets.drop(columns=method_2_columns)
 
         # We label EPC C properties
-        region_assets["Confidence Tier"] = np.where(
-            region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
-            "5 - EPC C or above", region_assets["Confidence Tier"]
-        )
-
-        region_assets["Confidence Tier"] = np.where(
-            region_assets["Archetype ID"] == "EPC C OR ABOVE",
-            "5 - EPC C or above", region_assets["Confidence Tier"]
-        )
-
-        region_assets["Current EPC Band"] = np.where(
-            region_assets["Archetype ID"] == "EPC C OR ABOVE",
-            "C", region_assets["Current EPC Band"]
-        )
+        # region_assets["Confidence Tier"] = np.where(
+        #     region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
+        #     "5 - EPC C or above", region_assets["Confidence Tier"]
+        # )
+        #
+        # region_assets["Confidence Tier"] = np.where(
+        #     region_assets["Archetype ID"] == "EPC C OR ABOVE",
+        #     "5 - EPC C or above", region_assets["Confidence Tier"]
+        # )
+        #
+        # region_assets["Current EPC Band"] = np.where(
+        #     region_assets["Archetype ID"] == "EPC C OR ABOVE",
+        #     "C", region_assets["Current EPC Band"]
+        # )
 
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 

From 5d5001fec3114eab4ba84e7fc0e40270ec017d35 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 18 Nov 2024 22:47:39 +0000
Subject: [PATCH 25/31] added de-duping

---
 .../stonewater/Wave 3 Preparation.py          | 221 ++++++------------
 etl/find_my_epc/RetrieveFindMyEpc.py          |   6 +
 etl/route_march_data_pull/app.py              |   7 +
 3 files changed, 85 insertions(+), 149 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index f74dc19d..744b3400 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1803,21 +1803,26 @@ def propsed_wave_3_sample():
 
     def match_property_to_surveyed(property, survey_results_with_original_features):
         surveyed = survey_results_with_original_features[
+            (
+                survey_results_with_original_features["Postal Region"] ==
+                property["Postal Region"]
+            ) &
             (
                 survey_results_with_original_features["Property Type"] ==
                 property["Property Type"]
+            )
+            &
+            (
+                survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                property["Wall Type"].split(":")[0]
             ) &
             (
-                survey_results_with_original_features["Wall Type"] ==
-                property["Wall Type"]
+                survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                property["Roof Type"].split(":")[0]
             ) &
             (
-                survey_results_with_original_features["Roof Type"] ==
-                property["Roof Type"]
-            ) &
-            (
-                survey_results_with_original_features["Heating"] ==
-                property["Heating"]
+                survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+                property["Heating"].split(":")[0]
             )
             ].copy()
 
@@ -1826,23 +1831,47 @@ def propsed_wave_3_sample():
 
         surveyed = survey_results_with_original_features[
             (
-                survey_results_with_original_features["Property Type"] ==
-                property["Property Type"]
+                survey_results_with_original_features["Postal Region"] ==
+                property["Postal Region"]
             ) &
             (
-                survey_results_with_original_features["Wall Type"] ==
-                property["Wall Type"]
+                survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+                property["Property Type"].split(":")[0]
+            )
+            &
+            (
+                survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                property["Wall Type"].split(":")[0]
             ) &
             (
                 survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
                 property["Roof Type"].split(":")[0]
             ) &
             (
-                survey_results_with_original_features["Heating"] ==
-                property["Heating"]
+                survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+                property["Heating"].split(":")[0]
             )
             ].copy()
 
+        # surveyed = survey_results_with_original_features[
+        #     (
+        #         survey_results_with_original_features["Property Type"] ==
+        #         property["Property Type"]
+        #     ) &
+        #     (
+        #         survey_results_with_original_features["Wall Type"] ==
+        #         property["Wall Type"]
+        #     ) &
+        #     (
+        #         survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+        #         property["Roof Type"].split(":")[0]
+        #     ) &
+        #     (
+        #         survey_results_with_original_features["Heating"] ==
+        #         property["Heating"]
+        #     )
+        #     ].copy()
+
         if not surveyed.empty:
             return surveyed
 
@@ -1906,7 +1935,12 @@ def propsed_wave_3_sample():
             on="Address ID",
             how="left"
         )
-        region_assets['Distance to Closest Match (m)'] = 0
+        region_assets['Distance to Closest Match (m)'] = None
+        region_assets["Distance to Closest Match (m)"] = np.where(
+            ~pd.isnull(region_assets["Current EPC Band"]),
+            0,
+            region_assets["Distance to Closest Match (m)"]
+        )
 
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = None
@@ -2016,7 +2050,7 @@ def propsed_wave_3_sample():
 
         missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
 
-        archetype_surveyed = []
+        # archetype_surveyed = []
         for arch_id in missed_archetypes:
             for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
                 archetype_data = survey_results_with_original_features[
@@ -2175,7 +2209,14 @@ def propsed_wave_3_sample():
                     {
                         "Address ID": a_id,
                         "Confidence Tier": "4 - no similar property, needs survey to confirm",
-                        "Current EPC Band": "Needs Survey"
+                        "Current EPC Band": "Needs Survey",
+                        "Current SAP Rating": "Needs Survey",
+                        'Survey: Main Wall Type': "Not Surveyed",
+                        "Survey: Main Alternative Wall": "Not Surveyed",
+                        "Survey: Main Roof Type": "Not Surveyed",
+                        "Survey: Primary Heating System": "Not Surveyed",
+                        "Survey: Matching Address ID": "Not Surveyed",
+                        'Distance to Closest Match (m)': 9999999,
                     }
                 )
                 continue
@@ -2197,18 +2238,6 @@ def propsed_wave_3_sample():
             # Take the 3 nearest
             surveyed = surveyed.head(3)
 
-            # # We allow a max distance of 10km
-            # surveyed = surveyed[surveyed["distance_meters"] < 10000]
-            # if surveyed.empty:
-            #     final_missed_matches.append(
-            #         {
-            #             "Address ID": a_id,
-            #             "Confidence Tier": "4 - no similar property, needs survey to confirm",
-            #             "Current EPC Band": "Needs Survey"
-            #         }
-            #     )
-            #     continue
-
             # perform a weighted mean of SAP rating - the closer the better
             expected_sap = np.average(
                 surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1)
@@ -2218,129 +2247,24 @@ def propsed_wave_3_sample():
             if expected_epc in ["C", "B", "A"]:
                 match_type = "5 - EPC C or above"
 
+            closest_match = surveyed.iloc[0]
+
             final_missed_matches.append(
                 {
                     "Address ID": a_id,
                     "Confidence Tier": match_type,
-                    "Current EPC Band": expected_epc
+                    "Current EPC Band": expected_epc,
+                    "Current SAP Rating": expected_sap,
+                    'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
+                    "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"],
+                    "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"],
+                    "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
+                    "Survey: Matching Address ID": closest_match["Address ID"],
+                    'Distance to Closest Match (m)': closest_match["distance_meters"],
                 }
             )
             continue
 
-            # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
-            #     filter_property_types = ["House", "Bungalow"]
-            # else:
-            #     filter_property_types = ["Flat"]
-            #
-            # surveyed_similar = survey_results_with_original_features[
-            #     (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
-            #     (
-            #         survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-            #             filter_property_types
-            #         )
-            #     ) &
-            #     (
-            #         survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-            #         property["Wall Type"].split(":")[0]
-            #     ) &
-            #     (
-            #         survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-            #         property["Roof Type"].split(":")[0]
-            #     ) &
-            #     (
-            #         survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-            #         property["Heating"].split(":")[0]
-            #     )
-            #     ]
-            # if surveyed_similar.empty:
-            #     surveyed_similar = survey_results_with_original_features[
-            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-            #             filter_property_types
-            #         )) &
-            #         (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-            #          property["Wall Type"].split(":")[0]) &
-            #         (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-            #          property["Roof Type"].split(":")[0]) &
-            #         (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-            #          property["Heating"].split(":")[0])
-            #         ]
-            #
-            # if surveyed_similar.empty:
-            #
-            #     # We get an average based on the postcode
-            #     surveyed_similar = survey_results_with_original_features[
-            #         (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
-            #         (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
-            #             filter_property_types
-            #         ))
-            #         ]
-            #     if surveyed_similar.empty:
-            #         surveyed_similar_entire_population = survey_results_with_original_features[
-            #             (
-            #                 survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
-            #                 "Property Type"].split(":")[0]
-            #             ) &
-            #             (
-            #                 survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
-            #                 property["Wall Type"].split(":")[0]
-            #             ) &
-            #             (
-            #                 survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
-            #                 property["Roof Type"].split(":")[0]
-            #             ) &
-            #             (
-            #                 survey_results_with_original_features["Heating"].str.split(":").str[0] ==
-            #                 property["Heating"].split(":")[0]
-            #             )
-            #             ]
-            #
-            #         # We order them by distance on postcode
-            #
-            #         # Average
-            #         expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
-            #         expected_epc = sap_to_epc(expected_sap)
-            #
-            #         final_missed_matches.append(
-            #             {
-            #                 "Address ID": a_id,
-            #                 "Confidence Tier": "3 - similar property, all areas searched",
-            #                 "Current EPC Band": expected_epc
-            #             }
-            #
-            #         )
-            #     else:
-            #         expected_sap = surveyed_similar["Current SAP Rating"].mean()
-            #         expected_epc = sap_to_epc(expected_sap)
-            #         if expected_epc in ["C", "B", "A"]:
-            #             tier = "5 - EPC C or above"
-            #         else:
-            #             tier = "3 - similar property, relaxed conditions"
-            #
-            #         final_missed_matches.append(
-            #             {
-            #                 "Address ID": a_id,
-            #                 "Confidence Tier": tier,
-            #                 "Current EPC Band": expected_epc
-            #             }
-            #         )
-            #     continue
-            # # We take an average
-            # expected_sap = surveyed_similar["Current SAP Rating"].mean()
-            # expected_epc = sap_to_epc(expected_sap)
-            # if expected_epc in ["C", "B", "A"]:
-            #     tier = "5 - EPC C or above"
-            # else:
-            #     tier = "3 - similar property"
-            #
-            # final_missed_matches.append(
-            #     {
-            #         "Address ID": a_id,
-            #         "Confidence Tier": tier,
-            #         "Current EPC Band": expected_epc
-            #     }
-            # )
-
         final_missed_matches = pd.DataFrame(final_missed_matches)
 
         region_assets = region_assets.merge(
@@ -2353,12 +2277,11 @@ def propsed_wave_3_sample():
         region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna(
             region_assets["Confidence Tier_method3"]
         )
-        region_assets["Current EPC Band"] = np.where(
-            pd.isnull(region_assets["Current EPC Band"]),
-            region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"]
-        )
 
-        region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"])
+        region_assets = fill_survey_columns(region_assets, suffix="_method3")
+
+        method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")]
+        region_assets = region_assets.drop(columns=method_3_columns)
 
         if pd.isnull(region_assets["Current EPC Band"]).sum():
             raise Exception("Something went wrong")
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index ac0e8235..b6394275 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -289,6 +289,12 @@ class RetrieveFindMyEpc:
             "Fuel change recommendation": [],
             "PV Cells recommendation": [],
             "Replacement glazing units": ["double_glazing"],
+            "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"],
+            "High heat retention storage heaters": ["high_heat_retention_storage_heaters"],
+            "Gas condensing boiler": ["boiler_upgrade"],
+            "Change room heaters to condensing boiler": ["boiler_upgrade"],
+            "Cylinder thermostat": ["cylinder_thermostat"],
+            "Heat recovery system for mixer showers": ["heat_recovery_shower"],
         }
 
         survey = True
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index d9f6bf43..6f9dd135 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -150,6 +150,13 @@ def app():
         # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
         asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
 
+    # We check for duplicated addresses
+    asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
+    if asset_list["deduper"].duplicated().sum():
+        # Drop the dupes
+        print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
+        asset_list = asset_list[~asset_list["deduper"].duplicated()]
+
     epc_data, errors, no_epc = get_data(
         asset_list=asset_list,
         fulladdress_column=FULLADDRESS_COLUMN,

From d65c99f62a0fd7cb6e1c58a5816db0e4e4477fb5 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 Nov 2024 08:41:44 +0000
Subject: [PATCH 26/31] tidying up optimisation process

---
 .../stonewater/Wave 3 Preparation.py          | 105 ++++--------------
 1 file changed, 24 insertions(+), 81 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 744b3400..c8e61a0e 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2297,39 +2297,9 @@ def propsed_wave_3_sample():
         if pd.isnull(results[c]).sum():
             raise Exception("Something went wrong")
 
-    # home = results[results["Confidence Tier"] == "5 - EPC C or above"].sample(1)
-    # region = home["Postal Region"].values[0]
-
-    # Create a pivot table for counts of Confidence Tier by Postal Region
-    geographic_summary = results.pivot_table(
-        index='Postal Region',
-        columns='Confidence Tier',
-        aggfunc='size',
-        fill_value=0
-    ).reset_index()
-
-    # We create the gain and loss columns
-    # Gain is the sum of these columns:
-    # '1 - Archetype surveyed',
-    # '1 - property was surveyed',
-    # '2 - same archetype',
-    # '3 - similar property, weighted on distance'
-
     gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x])
     loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x])
 
-    geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1)
-    geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1)
-
-    print(geographic_summary.sum())
-
-    geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
-    geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
-    geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
-
-    loss = geographic_summary["Loss"].values
-    gain = geographic_summary["Gain"].values
-
     def optimise(gain, loss, max_loss=250):
 
         # Define the coefficients for the objective function (negative because we maximize Gain)
@@ -2352,76 +2322,51 @@ def propsed_wave_3_sample():
 
         return selected_rows, optimal_gain
 
-    selected_rows, _ = optimise(gain, loss, 250)
-
-    # Select the rows that are selected
-    geographic_summary["Selected"] = selected_rows == 1
-    geographic_summary[geographic_summary["Selected"]].sum()
-
-    region_totals = geographic_summary[
-        geographic_summary["Selected"]
-    ][["Gain", "Loss"]].sum()
-
-    # We now see if there are any postcodes that have no loss that can be added
-    unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values
-
-    # TODO: Try on street
-
-    postcode_summary = results.pivot_table(
+    street_summary = results.pivot_table(
         index='Street and Region',
         columns='Confidence Tier',
         aggfunc='size',
         fill_value=0
     ).reset_index()
-    # postcode_summary = postcode_summary.merge(
-    #     results[["Postcode", "Postal Region"]].drop_duplicates(),
-    #     how="left", on="Postcode"
-    # )
-    #
-    postcode_summary_unselected_regions = postcode_summary.copy()
-    # postcode_summary_unselected_regions = postcode_summary[
-    #     postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions)
-    # ].copy()
 
-    postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1)
-    postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1)
+    street_summary["Gain"] = street_summary[gain_columns].sum(axis=1)
+    street_summary["Loss"] = street_summary[loss_columns].sum(axis=1)
 
-    # Remaining loss allowed
-    # remaining_loss_constraint = 230 - region_totals["Loss"]
-    remaining_loss_constraint = 220
-    postcode_selected_rows, _ = optimise(
-        gain=postcode_summary_unselected_regions["Gain"].values,
-        loss=postcode_summary_unselected_regions["Loss"].values,
-        max_loss=int(remaining_loss_constraint)
+    print(street_summary.sum())
+
+    selected_rows, _ = optimise(
+        gain=street_summary["Gain"].values,
+        loss=street_summary["Loss"].values,
+        max_loss=250
     )
 
-    postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1
-    postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum()
+    street_summary["Selected"] = selected_rows == 1
+    print(street_summary[street_summary["Selected"]][["Gain", "Loss"]].sum())
 
-    postcode_optimised_additional_properties = postcode_summary_unselected_regions[
-        postcode_summary_unselected_regions["Selected"]
+    selected_streets = street_summary[
+        street_summary["Selected"]
     ]
 
-    postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum()
+    totals = selected_streets[["Gain", "Loss"]].sum()
 
-    bid_size = postcode_totals.sum()
+    bid_size = totals.sum()
     print("Bid Size:", bid_size)
-    total_epc_d_or_below = postcode_totals["Gain"]
+    total_epc_d_or_below = totals["Gain"]
     print("Total EPC D or below:", total_epc_d_or_below)
-    total_epc_c = postcode_totals["Loss"]
+    total_epc_c = totals["Loss"]
     print("Total EPC C or above:", total_epc_c)
     # Total needing a survey
-    total_needing_survey = postcode_optimised_additional_properties[
+    total_needing_survey = selected_streets[
         "4 - no similar property, needs survey to confirm"
     ].sum()
     print("Total needing survey:", total_needing_survey)
 
     # Look for postcodes that have no loss
-    unselected_streets = postcode_summary_unselected_regions[
-        ~postcode_summary_unselected_regions["Selected"]
+    unselected_streets = street_summary[
+        ~street_summary["Selected"]
     ]["Street and Region"].values
 
-    postcode_summary2 = results[
+    postcode_summary = results[
         results["Street and Region"].isin(unselected_streets)
     ].pivot_table(
         index='Postcode',
@@ -2430,14 +2375,12 @@ def propsed_wave_3_sample():
         fill_value=0
     ).reset_index()
 
-    postcode_summary2["Gain"] = postcode_summary2[gain_columns].sum(axis=1)
-    postcode_summary2["Loss"] = postcode_summary2[loss_columns].sum(axis=1)
+    postcode_summary["Gain"] = postcode_summary[gain_columns].sum(axis=1)
+    postcode_summary["Loss"] = postcode_summary[loss_columns].sum(axis=1)
 
-    no_loss_postcodes = postcode_summary2[postcode_summary2["Loss"] == 0].sort_values("Gain", ascending=False)
+    no_loss_postcodes = postcode_summary[postcode_summary["Loss"] == 0].sort_values("Gain", ascending=False)
     total_bid_size = bid_size + no_loss_postcodes["Gain"].sum()
     print(total_bid_size)
 
-    z = results[results["Confidence Tier"] == "5 - EPC C or above"]
-
 # if __name__ == "__main__":
 #     main()

From d163ca99315b2e2c82b95ab629041351374fb081 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 Nov 2024 13:54:46 +0000
Subject: [PATCH 27/31] fixing filling of property

---
 .../stonewater/Wave 3 Preparation.py          | 188 +++++++++---------
 1 file changed, 98 insertions(+), 90 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index c8e61a0e..426097e8 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1669,7 +1669,7 @@ def propsed_wave_3_sample():
         header=4
     )
 
-    # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
+    # TODO: We drop 7 properties missing
     # UPRN
     asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])]
     # Clean address ids
@@ -1699,15 +1699,23 @@ def propsed_wave_3_sample():
         os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"),
         header=0
     )
-    survey_results = survey_results.merge(
+
+    survey_results = survey_results.drop(
+        columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]
+    ).merge(
         additional_survey_data[
             [
                 "Address ID",
                 "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness",
                 "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
-                "Main Building Alternative Wall Thickness"
+                "Main Building Alternative Wall Thickness",
+                "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"
             ]
-        ].rename(columns={"Main Wall Insulation_x": "Main Wall Insulation Type"}),
+        ].rename(
+            columns={
+                "Main Wall Insulation_x": "Main Wall Insulation Type",
+            }
+        ),
         how="left",
         on="Address ID"
     )
@@ -1718,6 +1726,7 @@ def propsed_wave_3_sample():
             "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode",
             "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness",
             "Existing Primary Heating System",
+            "Package Ref",
             "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness",
             "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
             "Main Building Alternative Wall Thickness"
@@ -1727,6 +1736,7 @@ def propsed_wave_3_sample():
             "Existing Primary Heating System": "Survey: Primary Heating System"
         }
     )
+
     survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
     # Concatenate from the wall information
     survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
@@ -1929,7 +1939,7 @@ def propsed_wave_3_sample():
         region_assets = region_assets.merge(
             exact_surveyed[
                 ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
-                    "Survey: Matching Address ID"
+                    "Survey: Matching Address ID", "Package Ref"
                 ]
                 ],
             on="Address ID",
@@ -2005,6 +2015,7 @@ def propsed_wave_3_sample():
                         'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
                         "Survey: Matching Address ID": closest_match["Address ID"],
                         'Distance to Closest Match (m)': closest_match["distance_meters"],
+                        "Package Ref": closest_match["Package Ref"],
                         "Match Type": match_type
                     }
                 )
@@ -2015,7 +2026,8 @@ def propsed_wave_3_sample():
                 columns=[
                     "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
                     'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
-                    'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
+                    'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)',
+                    "Match Type"
                 ]
             )
 
@@ -2032,8 +2044,8 @@ def propsed_wave_3_sample():
         # Label the tier 1 properties
         region_assets["Confidence Tier"] = np.where(
             region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
-            pd.isnull(region_assets["Confidence Tier"]),
-            "1 - Archetype surveyed in region", region_assets["Confidence Tier"]
+            pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]),
+            region_assets["Match Type"], region_assets["Confidence Tier"]
         )
 
         # Handle EPC C
@@ -2046,86 +2058,7 @@ def propsed_wave_3_sample():
         region_assets = fill_survey_columns(region_assets, suffix="_method1")
 
         method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")]
-        region_assets = region_assets.drop(columns=method_1_columns)
-
-        missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
-
-        # archetype_surveyed = []
-        for arch_id in missed_archetypes:
-            for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
-                archetype_data = survey_results_with_original_features[
-                    survey_results["Archetype ID"] == arch_id
-                    ].copy()
-                if archetype_data.empty:
-                    continue
-                raise Exception("IMPLEMENT ME")
-        #         archetype_data["distance_meters"] = haversine(
-        #             lat1=property.latitude, lon1=property.longitude,
-        #             lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
-        #         )
-        #         expected_sap = np.average(
-        #             archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
-        #         )
-        #         expected_epc = sap_to_epc(expected_sap)
-        #         archetype_surveyed.append(
-        #             {
-        #                 "Archetype ID": arch_id,
-        #                 "Address ID": property["Address ID"],
-        #                 "Current EPC Band": expected_epc
-        #             }
-        #         )
-        # archetype_surveyed = pd.DataFrame(archetype_surveyed)
-        # if archetype_surveyed.empty:
-        #     archetype_surveyed = pd.DataFrame(
-        #         columns=[
-        #             "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
-        #             'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
-        #             'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
-        #         ]
-        #     )
-        #
-        # region_assets = region_assets.merge(
-        #     archetype_surveyed,
-        #     on=["Archetype ID", "Address ID"],
-        #     how="left",
-        #     suffixes=("", "_method2")
-        # )
-        #
-        # region_assets["Confidence Tier"] = np.where(
-        #     region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
-        #         region_assets["Confidence Tier"]),
-        #     "2 - same archetype", region_assets["Confidence Tier"]
-        # )
-        #
-        # for col in [
-        #     'Current EPC Band', 'Current SAP Rating',
-        #     'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
-        #     'Survey: Main Roof Type', 'Survey: Primary Heating System',
-        #     'Survey: Matching Address ID', 'Distance to Closest Match (m)'
-        # ]:
-        #     region_assets[col] = np.where(
-        #         pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]),
-        #         region_assets[col + "_method2"], region_assets[col]
-        #     )
-        #
-        # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")]
-        # region_assets = region_assets.drop(columns=method_2_columns)
-
-        # We label EPC C properties
-        # region_assets["Confidence Tier"] = np.where(
-        #     region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
-        #     "5 - EPC C or above", region_assets["Confidence Tier"]
-        # )
-        #
-        # region_assets["Confidence Tier"] = np.where(
-        #     region_assets["Archetype ID"] == "EPC C OR ABOVE",
-        #     "5 - EPC C or above", region_assets["Confidence Tier"]
-        # )
-        #
-        # region_assets["Current EPC Band"] = np.where(
-        #     region_assets["Archetype ID"] == "EPC C OR ABOVE",
-        #     "C", region_assets["Current EPC Band"]
-        # )
+        region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"])
 
         missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
 
@@ -2217,6 +2150,7 @@ def propsed_wave_3_sample():
                         "Survey: Primary Heating System": "Not Surveyed",
                         "Survey: Matching Address ID": "Not Surveyed",
                         'Distance to Closest Match (m)': 9999999,
+                        "Package Ref": "Not Surveyed",
                     }
                 )
                 continue
@@ -2261,6 +2195,7 @@ def propsed_wave_3_sample():
                     "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
                     "Survey: Matching Address ID": closest_match["Address ID"],
                     'Distance to Closest Match (m)': closest_match["distance_meters"],
+                    "Package Ref": closest_match["Package Ref"]
                 }
             )
             continue
@@ -2292,8 +2227,10 @@ def propsed_wave_3_sample():
 
     # Check if there are missings in current epc band, current sap rating or any of the survey attributes
     for c in (
-        ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
-        survey_attribute_columns):
+        [
+            "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
+        survey_attribute_columns
+    ):
         if pd.isnull(results[c]).sum():
             raise Exception("Something went wrong")
 
@@ -2382,5 +2319,76 @@ def propsed_wave_3_sample():
     total_bid_size = bid_size + no_loss_postcodes["Gain"].sum()
     print(total_bid_size)
 
+    # Label final outputs
+    # We create a summary of packages by street
+    results["Package Ref"] = results["Package Ref"].fillna("Incomplete")
+    results["Package Ref"] = results["Package Ref"].astype(str)
+    package_summary = results.pivot_table(
+        index='Street and Region',
+        columns='Package Ref',
+        aggfunc='size',
+        fill_value=0
+    ).reset_index()
+
+    street_bid_structure = street_summary.merge(
+        package_summary, how="left", on="Street and Region"
+    )
+    street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False)
+    street_bid_structure.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
+    )
+
+    individual_units_programme = results.copy()
+    individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin(
+        street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values
+    )
+
+    # Merge on Stonewaters ID
+    asset_list_ids = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+        "- Archetyped V3.1.xlsx",
+        header=4
+    )[["Address ID", "Org. ref."]]
+    # Clean address ids
+    asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])]
+    asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"]
+    asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int)
+    individual_units_programme = individual_units_programme.merge(
+        asset_list_ids,
+        how="left",
+        on="Address ID",
+    )
+
+    individual_units_programme = individual_units_programme.merge(
+        asset_list_ids.rename(
+            columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"}
+        ),
+        how="left",
+        on="Survey: Matching Address ID"
+    )
+
+    individual_units_programme["Survey: Org. ref."] = np.where(
+        (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"),
+        "Not Surveyed",
+        individual_units_programme["Survey: Org. ref."]
+    )
+
+    if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull(
+        individual_units_programme["Org. ref."]).sum():
+        raise ValueError("something went wrong")
+
+    for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]:
+        individual_units_programme[col] = (
+            individual_units_programme[col]
+            .str.replace(r': nan(?=$|:)', '', regex=True)  # Remove ': nan' at the end or before another ':'
+            .str.replace(r':\s+:', ': ', regex=True)  # Replace occurrences of ': :' with ': '
+            .str.replace(r'\s+', ' ', regex=True)  # Replace multiple spaces with a single space
+            .str.strip()  # Strip leading/trailing spaces
+        )
+
+    individual_units_programme.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False
+    )
+
 # if __name__ == "__main__":
 #     main()

From 1645f9ab9ed84bdb90fa2a732d697111b36bd17b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 Nov 2024 22:00:00 +0000
Subject: [PATCH 28/31] updating stonewater modelling code to use new data

---
 .../stonewater/Wave 3 Preparation.py          | 288 +++++++++++++++---
 1 file changed, 247 insertions(+), 41 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 426097e8..f4195592 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -1071,10 +1071,13 @@ def main():
     ]
 
     # We now merge on the coordinator data so that against each property, we can map the measures
+    # TODO: Get the pre & post primary energy numbers
+    # TODO: Make sure the numbers are going down
+
     retrofit_packages_board = pd.read_excel(
         os.path.join(
             CUSTOMER_FOLDER_PATH,
-            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1731315080 11.11.24.xlsx"
+            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
         ),
         header=4
     )
@@ -1084,6 +1087,18 @@ def main():
         retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
     ]
 
+    # populated_primary_energy = retrofit_packages_board[
+    #     ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)'])
+    # ]
+    #
+    # z = populated_primary_energy[
+    #     populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[
+    #         'BASE Primary energy (13a-272)']
+    #     ]
+    #
+    # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[
+    #     'BASE Primary energy (13a-272)'])
+
     # Replace \n with ""
     extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "")
 
@@ -1192,7 +1207,7 @@ def main():
         # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv(
         #     CUSTOMER_FOLDER_PATH + "/missed_debugging.csv")
 
-        if len(missing_ids) != 6:
+        if len(missing_ids) != 1:
             raise Exception("Unacceptable number of missings")
 
     if matching_lookup["Address ID"].duplicated().sum():
@@ -1239,7 +1254,6 @@ def main():
 
     if stonewater_data["Address ID"].duplicated().sum():
         raise Exception("Duplicate Address IDs")
-
     # Create a section for costs
     for measure in measure_columns:
         stonewater_data[f"Cost of {measure}"] = None
@@ -1297,8 +1311,41 @@ def main():
     ]:
         stonewater_data[c] = stonewater_data[c].astype(str)
 
+    # FIll the primary energy numbers from the excel
+    stonewater_data = stonewater_data.merge(
+        retrofit_packages_board[
+            [
+                "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)"
+            ]
+        ],
+        on=["Address ID", "Name"],
+        how="left"
+    )
+    stonewater_data["Primary Energy Use (kWh/yr)"] = np.where(
+        pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]),
+        stonewater_data["BASE Primary energy (13a-272)"],
+        stonewater_data["Primary Energy Use (kWh/yr)"]
+    )
+    stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"])
+
+    # Add on organisation reference
+    original_archetypes = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+        "- Archetyped V3.1.xlsx",
+        header=4
+    )
+    original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
+    original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
+    original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+
+    stonewater_data = stonewater_data.merge(
+        original_archetypes[["Address ID", 'Org. ref.']],
+        on="Address ID",
+        how="left"
+    )
+
     # Save this data to excel
-    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False)
+    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False)
 
     cost_sheet = [
         {
@@ -1677,6 +1724,12 @@ def propsed_wave_3_sample():
     asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
     asset_list["Address ID"] = asset_list["Address ID"].astype(int)
 
+    asset_list["Street name"] = np.where(
+        pd.isnull(asset_list["Street name"]),
+        asset_list["Postcode"],
+        asset_list["Street name"]
+    )
+
     # Create the postal region, taking the first part of the postcode
     asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
     asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"]
@@ -1684,43 +1737,16 @@ def propsed_wave_3_sample():
 
     # Keep just the columns we need
     asset_list = asset_list[
-        ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Street and Region",
+        ["UPRN", "Address ID", 'Org. ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region",
          "Property Type", "Wall Type", "Roof Type", "Heating"]
     ]
 
-    # Updated packages: to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V3.xlsx", index=False)
     survey_results = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"),
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
         header=13,
         sheet_name="Modelled Packages"
     )
 
-    additional_survey_data = pd.read_excel(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - costed retrofit packages V3.xlsx"),
-        header=0
-    )
-
-    survey_results = survey_results.drop(
-        columns=["Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]
-    ).merge(
-        additional_survey_data[
-            [
-                "Address ID",
-                "Main Wall Type", "Main Wall Insulation_x", "Main Wall Thickness",
-                "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
-                "Main Building Alternative Wall Thickness",
-                "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"
-            ]
-        ].rename(
-            columns={
-                "Main Wall Insulation_x": "Main Wall Insulation Type",
-            }
-        ),
-        how="left",
-        on="Address ID"
-    )
-
-    # TOOD: We probably want the actual surveyed wall, roof, heating type
     survey_results = survey_results[
         [
             "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode",
@@ -1768,6 +1794,105 @@ def propsed_wave_3_sample():
     if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
         raise ValueError("Something went wrong")
 
+    # Against properties that have NO package ref, we assign a package ref
+    properties_with_packages = survey_results_with_original_features[
+        ~pd.isnull(survey_results_with_original_features["Package Ref"])
+    ]
+
+    properties_without_packages = survey_results_with_original_features[
+        (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull(
+            survey_results_with_original_features["Package Ref"]
+        )
+        ]
+
+    # Change this to a lookup
+    package_ratings = pd.DataFrame([
+        {
+            "1A": 1,
+            "1B": 2,
+            "2A": 3,
+            "2B": 4,
+            "3A": 5,
+            "3B": 6,
+            4: 7
+        }
+    ])
+    package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank")
+
+    mapped_package_refs = []
+    for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)):
+        # Same archetype?
+        matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]]
+
+        if matches.empty:
+            # Similar property
+            matches = properties_with_packages[
+                (properties_with_packages["Property Type"].str.split(":").str[0] ==
+                 property["Property Type"].split(":")[0]) &
+                (properties_with_packages["Wall Type"] == property["Wall Type"]) &
+                (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) &
+                (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0])
+                ]
+        if matches.empty:
+            matches = properties_with_packages[
+                (properties_with_packages["Property Type"].str.split(":").str[0] ==
+                 property["Property Type"].split(":")[0]) &
+                (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) &
+                (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) &
+                (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0])
+                ]
+        if matches.empty:
+            raise Exception("Implement me")
+        if matches.shape[0] > 1:
+            # Take the package with the highest rank
+            matches = matches.merge(
+                package_ratings,
+                on="Package Ref",
+                how="left"
+            ).sort_values("Rank", ascending=False).head(1)
+
+        mapped_package_refs.append(
+            {
+                "Address ID": property["Address ID"],
+                "Matched Package Ref": matches["Package Ref"].values[0]
+            }
+        )
+
+    mapped_package_refs = pd.DataFrame(mapped_package_refs)
+
+    survey_results = survey_results.merge(
+        mapped_package_refs,
+        on="Address ID",
+        how="left"
+    )
+    survey_results["Package Ref"] = np.where(
+        pd.notnull(survey_results["Matched Package Ref"]),
+        survey_results["Matched Package Ref"],
+        survey_results["Package Ref"]
+    )
+    survey_results = survey_results.drop(columns=["Matched Package Ref"])
+
+    # Do the same with survey_results_with_original_features
+    survey_results_with_original_features = survey_results_with_original_features.merge(
+        mapped_package_refs,
+        on="Address ID",
+        how="left"
+    )
+    survey_results_with_original_features["Package Ref"] = np.where(
+        pd.notnull(survey_results_with_original_features["Matched Package Ref"]),
+        survey_results_with_original_features["Matched Package Ref"],
+        survey_results_with_original_features["Package Ref"]
+    )
+    survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"])
+
+    # Save the data for reference
+    # mapped_package_refs = mapped_package_refs.merge(
+    #     asset_list[["Name", "Postcode", "Address ID", "Org. ref."]],
+    #     on="Address ID",
+    #     how="left"
+    # )
+    # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False)
+
     # We get longitude & Latitude
     archetyping_spatial_features = read_pickle_from_s3(
         bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
@@ -1911,7 +2036,8 @@ def propsed_wave_3_sample():
             'Current EPC Band', 'Current SAP Rating',
             'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
             'Survey: Main Roof Type', 'Survey: Primary Heating System',
-            'Survey: Matching Address ID', 'Distance to Closest Match (m)'
+            'Survey: Matching Address ID', 'Distance to Closest Match (m)',
+            "Package Ref"
         ]:
             region_assets[col] = np.where(
                 pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]),
@@ -2027,7 +2153,7 @@ def propsed_wave_3_sample():
                     "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
                     'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
                     'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)',
-                    "Match Type"
+                    "Match Type", "Package Ref"
                 ]
             )
 
@@ -2183,6 +2309,13 @@ def propsed_wave_3_sample():
 
             closest_match = surveyed.iloc[0]
 
+            # The closest property may be an EPC C, so we take the package ref from the nearest property
+            # with a non-NA package ref
+            if expected_epc in ["C", "B", "A"]:
+                package_ref = None
+            else:
+                package_ref = surveyed["Package Ref"].dropna().values[0]
+
             final_missed_matches.append(
                 {
                     "Address ID": a_id,
@@ -2195,7 +2328,7 @@ def propsed_wave_3_sample():
                     "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
                     "Survey: Matching Address ID": closest_match["Address ID"],
                     'Distance to Closest Match (m)': closest_match["distance_meters"],
-                    "Package Ref": closest_match["Package Ref"]
+                    "Package Ref": package_ref
                 }
             )
             continue
@@ -2225,6 +2358,11 @@ def propsed_wave_3_sample():
 
     results = pd.concat(results)
 
+    results[
+        pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D")
+        ]["Postal Region"]
+    results[resul]
+
     # Check if there are missings in current epc band, current sap rating or any of the survey attributes
     for c in (
         [
@@ -2269,8 +2407,6 @@ def propsed_wave_3_sample():
     street_summary["Gain"] = street_summary[gain_columns].sum(axis=1)
     street_summary["Loss"] = street_summary[loss_columns].sum(axis=1)
 
-    print(street_summary.sum())
-
     selected_rows, _ = optimise(
         gain=street_summary["Gain"].values,
         loss=street_summary["Loss"].values,
@@ -2334,9 +2470,6 @@ def propsed_wave_3_sample():
         package_summary, how="left", on="Street and Region"
     )
     street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False)
-    street_bid_structure.to_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
-    )
 
     individual_units_programme = results.copy()
     individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin(
@@ -2386,6 +2519,79 @@ def propsed_wave_3_sample():
             .str.strip()  # Strip leading/trailing spaces
         )
 
+    # Flag selected EPC A-C / "Needs Survey" units for possible infill: wall insulation (flats) or low carbon heating
+    selected_epc_c = individual_units_programme[
+        (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) &
+        (individual_units_programme["Unit in Programme"])
+        ]
+
+    flat_wall_map = {
+        "CA Cavity: F Filled Cavity": False,
+        "CA Cavity: A As Built": True,
+        "SO Solid Brick: A As Built": True,
+        "Not Surveyed": False
+    }
+
+    heating_map = {
+        "BGW Post 98 Combi condens. with auto ign.": False,
+        "BGB Post 98 Regular condens. with auto ign.": False,
+        "SEK High heat retention storage heaters": False,
+        "SEB Modern slimline storage heaters": True,
+        "Not Surveyed": False
+    }
+
+    infill_data = []
+    for _, epc_c_property in selected_epc_c.iterrows():
+        if epc_c_property["Property Type"].split(":")[0] == "Flat":
+            # Look for a wall insulation measure
+            infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]]
+            infill_data.append(
+                {
+                    "Address ID": epc_c_property["Address ID"],
+                    "Street and Region": epc_c_property["Street and Region"],
+                    "Possible Flat Infill?": infill
+                }
+            )
+            continue
+
+        infill = heating_map[epc_c_property["Survey: Primary Heating System"]]
+        infill_data.append(
+            {
+                "Address ID": epc_c_property["Address ID"],
+                "Street and Region": epc_c_property["Street and Region"],
+                "Low Carbon Heating Infill?": infill
+            }
+        )
+    infill_data = pd.DataFrame(infill_data)
+
+    individual_units_programme = individual_units_programme.merge(
+        infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']],
+        how="left", on="Address ID"
+    )
+
+    for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']:
+        individual_units_programme[c] = individual_units_programme[c].fillna(False)
+
+    infill_by_street = infill_data.pivot_table(
+        index='Street and Region',
+        values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'],
+        aggfunc='sum',
+        fill_value=0
+    ).reset_index()
+
+    street_bid_structure = street_bid_structure.merge(
+        infill_by_street, how="left", on="Street and Region"
+    )
+
+    for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']:
+        street_bid_structure[c] = street_bid_structure[c].fillna(0)
+
+    street_bid_structure.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
+    )
+
+    # TODO: Add the full Address!!!
+
     individual_units_programme.to_csv(
         os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False
     )

From 9057b3d4da71f3dd63a8ae2924a073f6cc168dc8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 Nov 2024 22:04:19 +0000
Subject: [PATCH 29/31] fixing assignment of package ref

---
 etl/customers/stonewater/Wave 3 Preparation.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index f4195592..4a841f61 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2126,8 +2126,16 @@ def propsed_wave_3_sample():
                 )
                 expected_epc = sap_to_epc(expected_sap)
 
+                archetype_data = archetype_data.sort_values("distance_meters", ascending=True)
+
                 # We take the features of the closest matching property
-                closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0]
+                closest_match = archetype_data.iloc[0]
+
+                # Set the package ref
+                if expected_epc in ["C", "B", "A"]:
+                    package_ref = None
+                else:
+                    package_ref = archetype_data["Package Ref"].dropna().values[0]
 
                 region_surveyed.append(
                     {
@@ -2141,7 +2149,7 @@ def propsed_wave_3_sample():
                         'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
                         "Survey: Matching Address ID": closest_match["Address ID"],
                         'Distance to Closest Match (m)': closest_match["distance_meters"],
-                        "Package Ref": closest_match["Package Ref"],
+                        "Package Ref": package_ref,
                         "Match Type": match_type
                     }
                 )

From 0fafb03deebca4833680594b989b8362386257be Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 Nov 2024 22:06:51 +0000
Subject: [PATCH 30/31] tidying up code

---
 .../stonewater/Wave 3 Preparation.py          | 27 ++-----------------
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 4a841f61..34ab778a 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2366,10 +2366,8 @@ def propsed_wave_3_sample():
 
     results = pd.concat(results)
 
-    results[
-        pd.isnull(results["Package Ref"]) & (results["Current EPC Band"] == "D")
-        ]["Postal Region"]
-    results[resul]
+    if (pd.isnull(results["Package Ref"]) & (~results["Current EPC Band"].isin(["A", "B", "C"]))).sum():
+        raise ValueError("Missing Package Refs")
 
     # Check if there are missings in current epc band, current sap rating or any of the survey attributes
     for c in (
@@ -2442,27 +2440,6 @@ def propsed_wave_3_sample():
     ].sum()
     print("Total needing survey:", total_needing_survey)
 
-    # Look for postcodes that have no loss
-    unselected_streets = street_summary[
-        ~street_summary["Selected"]
-    ]["Street and Region"].values
-
-    postcode_summary = results[
-        results["Street and Region"].isin(unselected_streets)
-    ].pivot_table(
-        index='Postcode',
-        columns='Confidence Tier',
-        aggfunc='size',
-        fill_value=0
-    ).reset_index()
-
-    postcode_summary["Gain"] = postcode_summary[gain_columns].sum(axis=1)
-    postcode_summary["Loss"] = postcode_summary[loss_columns].sum(axis=1)
-
-    no_loss_postcodes = postcode_summary[postcode_summary["Loss"] == 0].sort_values("Gain", ascending=False)
-    total_bid_size = bid_size + no_loss_postcodes["Gain"].sum()
-    print(total_bid_size)
-
     # Label final outputs
     # We create a summary of packages by street
     results["Package Ref"] = results["Package Ref"].fillna("Incomplete")

From 631a76cb99d213d857c732ea1a58dd9d4291a716 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 21 Nov 2024 11:41:16 +0000
Subject: [PATCH 31/31] stonewater model completed

---
 etl/customers/ksquared/Wave3 Modelling.py     | 35 +++++++++++++++++++
 .../stonewater/Wave 3 Preparation.py          | 32 +++++++++++------
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py
index 96ea2b03..7bfa33b3 100644
--- a/etl/customers/ksquared/Wave3 Modelling.py	
+++ b/etl/customers/ksquared/Wave3 Modelling.py	
@@ -8,6 +8,7 @@ from tqdm import tqdm
 import pandas as pd
 import numpy as np
 from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from etl.spatial.OpenUprnClient import OpenUprnClient
 from backend.SearchEpc import SearchEpc
 from utils.s3 import save_csv_to_s3
 
@@ -60,6 +61,7 @@ def hornsey():
     }
     extracted_data = []
     asset_list = []
+    hornsey_asset_list["row_id"] = hornsey_asset_list.index
     for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):
 
         if home["Address letter or number"] == "Flat 1 36 Haringey Park":
@@ -108,12 +110,24 @@ def hornsey():
         asset_list.append(
             {
                 "uprn": newest_epc["uprn"],
+                "row_id": home["row_id"],
                 "address": home["Address letter or number"],
                 "postcode": home["Postcode"],
                 "property_type": "Flat",  # They're all flats
             }
         )
 
+    # Get conservation area data
+    # uprns = [x["uprn"] for x in extracted_data]
+    # conservation_area_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev")
+    #
+    # addresses = pd.DataFrame(asset_list)
+    # addresses["uprn"] = addresses["uprn"].astype(int)
+    # conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN")
+    # conservation_area_df.to_csv(
+    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/hornsey_conservation_area_data.csv"
+    # )
+
     # We format the extracted data so that is has the same structure as non-intrusive recommendations
     # We then get the UPRNs and create the asset list
 
@@ -213,6 +227,8 @@ def caha():
         # If pattern doesn't match, return original address
         return address
 
+    caha_asset_list["row_id"] = caha_asset_list.index
+
     extracted_data = []
     asset_list = []
     for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)):
@@ -270,6 +286,7 @@ def caha():
 
         asset_list.append(
             {
+                "row_id": home["row_id"],
                 "uprn": uprn,
                 "address": address,
                 "postcode": home["Postcode"],
@@ -280,6 +297,24 @@ def caha():
             }
         )
 
+    # Missing row ids
+    missed = [r for r in caha_asset_list["row_id"].tolist() if r not in [x["row_id"] for x in asset_list]]
+
+    no_data = [x for x in asset_list if x["uprn"] in [None, ""]]
+    no_data = pd.DataFrame(no_data)
+
+    # Get conservation area data
+    uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]]
+    conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev")
+
+    addresses = pd.DataFrame(asset_list)
+    addresses["uprn"] = addresses["uprn"].astype(str)
+    conservation_area_data["UPRN"] = conservation_area_data["UPRN"].astype(str)
+    conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN")
+    conservation_area_df.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_conservation_area_data.csv"
+    )
+
     non_invasive_recommendations = [
         {
             "uprn": r["uprn"],
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 34ab778a..b6c29863 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -729,6 +729,7 @@ def extract_epr(pdf_path):
         "Main Building Alternative Wall Insulation": None,
         "Main Building Alternative Wall Dry-lining": None,
         "Main Building Alternative Wall Thickness": None,
+        "Main Fuel": None
     }
 
     with open(pdf_path, "rb") as file:
@@ -1086,7 +1087,6 @@ def main():
     retrofit_packages_board = retrofit_packages_board[
         retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
     ]
-
     # populated_primary_energy = retrofit_packages_board[
     #     ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)'])
     # ]
@@ -2442,8 +2442,11 @@ def propsed_wave_3_sample():
 
     # Label final outputs
     # We create a summary of packages by street
-    results["Package Ref"] = results["Package Ref"].fillna("Incomplete")
+    results["Package Ref"] = results["Package Ref"].fillna("EPC C - No Package")
     results["Package Ref"] = results["Package Ref"].astype(str)
+    results["Package Ref"] = np.where(
+        results["Package Ref"] == "4.0", "4", results["Package Ref"]
+    )
     package_summary = results.pivot_table(
         index='Street and Region',
         columns='Package Ref',
@@ -2451,6 +2454,8 @@ def propsed_wave_3_sample():
         fill_value=0
     ).reset_index()
 
+    assert sum([v for k, v in package_summary.sum().items() if k != "Street and Region"]) == results.shape[0]
+
     street_bid_structure = street_summary.merge(
         package_summary, how="left", on="Street and Region"
     )
@@ -2471,11 +2476,6 @@ def propsed_wave_3_sample():
     asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])]
     asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"]
     asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int)
-    individual_units_programme = individual_units_programme.merge(
-        asset_list_ids,
-        how="left",
-        on="Address ID",
-    )
 
     individual_units_programme = individual_units_programme.merge(
         asset_list_ids.rename(
@@ -2571,14 +2571,24 @@ def propsed_wave_3_sample():
     for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']:
         street_bid_structure[c] = street_bid_structure[c].fillna(0)
 
-    street_bid_structure.to_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure.csv"), index=False
+    master_sheet = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master "
+        "sheet.csv",
+        encoding='latin1'
+    )
+    master_sheet = master_sheet[["Address ID", "Main Fuel"]]
+
+    individual_units_programme = individual_units_programme.merge(
+        master_sheet, how="left", on="Address ID"
     )
 
-    # TODO: Add the full Address!!!
+    street_bid_structure.to_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure V2.csv"), index=False
+    )
 
     individual_units_programme.to_csv(
-        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme.csv"), index=False
+        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False
     )
 
 # if __name__ == "__main__":