From 739afbd79b36ce92c096e6664148199735388228 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 11 Dec 2023 16:17:43 +0000
Subject: [PATCH] finished ha32 analysis

---
 backend/SearchEpc.py            |  26 +--
 etl/eligibility/Eligibility.py  |  37 +++--
 etl/eligibility/ha_15_32/app.py | 281 ++++++++++++++++++++++++--------
 3 files changed, 248 insertions(+), 96 deletions(-)

diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index ff603cae..2a24a780 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -66,7 +66,7 @@ class SearchEpc:
 
     def search(self):
         # Get the EPC data with retries
-        response = {}
+
         for retry in range(self.max_retries):
             try:
                 response = self.client.domestic.search(
@@ -81,14 +81,15 @@ class SearchEpc:
                     print("Failed previous attempt but retry successful")
                 # If we got nothing, final try
                 if not response:
-                    raise NotImplementedError("Implement me")
-                    # response = client.domestic.search(
-                    #     params={"address": " ".join([home["Dwelling num"], home["Street"]]),
-                    #             "postcode": home["Postcode"]}
-                    # )
+                    # TODO: Make a call to OS uprn service and get the address' uprn, just in case there is an
+                    #       issue with how we are searching the api
+
+                    return {
+                        "status": 204,
+                        "message": "no data",
+                        "error": None
+                    }
 
-                    # TODO: Eventually, if we have nothing, we should exit with a 201 or 202, saying that
-                    # there is not data for this property
                 return {
                     "status": 200,
                     "message": "success",
@@ -107,7 +108,7 @@ class SearchEpc:
                         "error": str(e)
                     }
 
-    def retrieve(self):
+    def retrieve(self, property_type=None):
 
         """
         Given a successful search, this method will format the data and return it
@@ -125,7 +126,12 @@ class SearchEpc:
         uprns = {r["uprn"] for r in rows}
 
         if len(uprns) != 1:
-            raise NotImplementedError("More than one unique UPRN, need to handle this case")
+            logger.error("Multiple EPCs found - we should use an alternate method of searching - TODO")
+            if property_type is not None:
+                # We can do a filter on the property type
+                rows_filtered = [r for r in rows if r["property-type"] == property_type]
+                if rows_filtered:
+                    rows = rows_filtered
 
         # We now check for a full sap epc:
         full_sap_epc = [r for r in rows if r["transaction-type"] == "new dwelling"]
diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 18b4ecd1..a24fd2d5 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -37,20 +37,19 @@ class Eligibility:
 
     def parse_fabric(self, key):
 
-        if "thermal transmittance" in self.epc[key]:
-            if key == "walls-description":
-                return WallAttributes(self.epc["walls-description"]).process()
-
-            if key == "roof-description":
-                return RoofAttributes(self.epc["roof-description"]).process()
-
-            raise ValueError("Invalid Key")
-
         # Get the cleaned version of the description
-        return [
+        remapped = [
             data for data in self.cleaned[key] if
             data["original_description"] == self.epc[key]
-        ][0]
+        ]
+        if remapped:
+            return remapped[0]
+
+        if key == "walls-description":
+            return WallAttributes(self.epc["walls-description"]).process()
+
+        if key == "roof-description":
+            return RoofAttributes(self.epc["roof-description"]).process()
 
     def loft_insulation(self, loft_thickness_threshold: int = None):
         """
@@ -72,7 +71,7 @@ class Eligibility:
 
         if not is_loft:
             self.loft = {
-                "suitablility": False,
+                "suitability": False,
                 "thickness": None,
                 "reason": "roof not loft"
             }
@@ -88,14 +87,14 @@ class Eligibility:
         if insulation_thickness > loft_thickness_threshold:
             # Insulation is already thick enough
             self.loft = {
-                "suitablility": False,
+                "suitability": False,
                 "thickness": insulation_thickness,
                 "reason": "existing insulation"
             }
             return
 
         self.loft = {
-            "suitablility": True,
+            "suitability": True,
             "thickness": insulation_thickness,
             "reason": None
         }
@@ -121,7 +120,7 @@ class Eligibility:
 
         if is_unfilled_cavity:
             self.cavity = {
-                "suitablility": True,
+                "suitability": True,
                 "type": "empty",
             }
             return
@@ -170,7 +169,11 @@ class Eligibility:
         self.cavity_insulation()
         self.loft_insulation()
 
-        self.gbis = (self.cavity["suitablility"] or self.loft["suitibility"]) and (
+        # self.gbis = (self.cavity["suitability"] or self.loft["suitability"]) and (
+        #     int(self.epc["current-energy-efficiency"]) <= 68
+        # )
+
+        self.gbis = (self.cavity["suitability"]) and (
             int(self.epc["current-energy-efficiency"]) <= 68
         )
 
@@ -214,7 +217,7 @@ class Eligibility:
         self.loft_insulation()
 
         # make sure conditions 2 and 3 are true
-        is_eligible = self.cavity["suitablility"] & self.loft["suitablility"]
+        is_eligible = self.cavity["suitability"] & self.loft["suitability"]
 
         if post_retrofit_sap is None:
             self.eco4 = {
diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py
index 2cdee129..dd27f7c1 100644
--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@@ -8,7 +8,7 @@ from tqdm import tqdm
 import pandas as pd
 import numpy as np
 import msgpack
-from datetime import datetime
+from datetime import datetime, timedelta
 from utils.logger import setup_logger
 from utils.s3 import read_from_s3
 from dotenv import load_dotenv
@@ -87,6 +87,14 @@ def marge_ha_32(asset_list, identified_addresses):
         identified_addresses["Address"]
     )
 
+    # Correct the postcode for 7 Norton Grove, which is listed as HU4 6HQ but should be HU4 6HG
+    asset_list["Postcode"] = np.where(
+        (asset_list["Street"] == "Norton Grove") & (asset_list["Postcode"] == "HU4 6HQ") & (
+            asset_list["Dwelling num"] == "7"),
+        "HU4 6HG",
+        asset_list["Postcode"]
+    )
+
     asset_list["merge_key"] = (
         asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
         asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") +
@@ -398,6 +406,208 @@ def prepare_model_data_row(property_id, modelling_epc, cleaned, cleaning_data, c
     return scoring_dict
 
 
+def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
+    """
+    Search for an EPC for each row of ha_data and check its GBIS and ECO4 eligibility.
+
+    Rows without a house number are skipped and their row_ids are logged after the loop.
+    Rows whose search returns a 204 status are recorded with the message "No EPC found".
+    If the newest EPC is eligible for neither scheme, the penultimate EPC is checked instead.
+
+    :param ha_data: DataFrame of properties; the columns read are "Dwelling num", "Street",
+                    "Postcode", "Dwelling name", "Dwelling type", "row_id" and "identified"
+    :param cleaned: passed through to Eligibility and prepare_model_data_row
+    :param cleaning_data: passed through to prepare_model_data_row
+    :param created_at: passed through to prepare_model_data_row
+    :return: tuple of (results, scoring_data): one result dict per searched row, and one
+             scoring dict per row that is ECO4 eligible
+    """
+    house_number_key = "Dwelling num"
+    address_key = "Street"
+    postcode_key = "Postcode"
+    house_name = "Dwelling name"
+    house_type_key = "Dwelling type"
+
+    house_type_lookup = {
+        "Bungalow": "Bungalow",
+        "Flat": "Flat",
+        'House': "House",
+        'Store Room': None,
+        'Bungalow Disabled': "Bungalow",
+        'Flat Disabled': "Flat",
+        'Dormer Bungalow': "Bungalow",
+        'Pop-In': None,
+        'Laundry': None,
+        'Shed': None,
+        'Bedsit': None,
+    }
+
+    scoring_data = []
+    results = []
+    no_house_numbers = []
+    for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):
+
+        # If we don't have a house number, we'll continue since we won't realistically be able to find
+        # an address
+        if pd.isnull(house[house_number_key]):
+            no_house_numbers.append(house["row_id"])
+            continue
+
+        # Prefix the dwelling name to the address when the row has one
+        address_parts = [house[house_number_key], house[address_key]]
+        if not pd.isnull(house[house_name]):
+            address_parts.insert(0, house[house_name])
+        address1 = " ".join(address_parts)
+
+        searcher = SearchEpc(
+            address1=address1,
+            postcode=house[postcode_key]
+        )
+
+        response = searcher.search()
+        if response["status"] == 204:
+            # TODO: if house["identified"] is set, check whether we really have no EPC for it
+            results.append(
+                {
+                    "row_id": house["row_id"],
+                    "warmfront_identified": house["identified"],
+                    "gbis_eligible": None,
+                    "eco4_eligible": None,
+                    "date_epc": None,
+                    "message": "No EPC found",
+                }
+            )
+            continue
+
+        newest_epc, older_epcs, _ = searcher.retrieve(
+            property_type=house_type_lookup.get(house[house_type_key], None)
+        )
+        # We also want to get the penultimate epc
+        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
+        if not penultimate_epc:
+            penultimate_epc = newest_epc
+
+        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
+        eligibility.check_gbis()
+        eligibility.check_eco4()
+
+        # If there is no eligibility, we need to check the penultimate epc
+        if (not eligibility.eco4["eligible"]) and (not eligibility.gbis):
+            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
+            eligibility.check_gbis()
+            eligibility.check_eco4()
+
+        message = None
+        if eligibility.eco4["eligible"]:
+            # TODO: Check me
+            scoring_data.append(
+                prepare_model_data_row(
+                    property_id=house["row_id"],
+                    modelling_epc=eligibility.epc,
+                    cleaned=cleaned,
+                    cleaning_data=cleaning_data,
+                    created_at=created_at
+                )
+            )
+            message = "eco4 conditional on post sap"
+
+        # TODO: investigate rows where house["identified"] is set but neither scheme is eligible
+        # Whatever the outcome (eco4, gbis or nothing eligible), we make a record of it
+        results.append(
+            {
+                "row_id": house["row_id"],
+                "warmfront_identified": house["identified"],
+                "gbis_eligible": eligibility.gbis,
+                "eco4_eligible": eligibility.eco4["eligible"],
+                "date_epc": eligibility.epc["lodgement-date"],
+                "message": message,
+            }
+        )
+
+    logger.info(f"no_house_numbers: {no_house_numbers}")
+
+    return results, scoring_data
+
+
+def analyse_ha_32_results(results, ha32, no_house_numbers):
+    """
+    Compare our eligibility results for HA32 with the properties that Warmfront identified.
+
+    We want to know:
+    1) What proportion of identified properties we get correct
+    2) If we miss identified properties, why
+    3) Which properties do we identify that were not identified by warmfront. What is our confidence on these?
+    For HA32, most of these (if not all) properties were identified under gbis
+
+    :param no_house_numbers: row_ids that were skipped for having no house number
+    :return: tuple of (success_rate, new_possibilities)
+    """
+    results_df = pd.DataFrame(results)
+
+    # What proportion of the properties identified by Warmfront do we also flag as GBIS eligible?
+    warmfront_identified = results_df[
+        results_df["warmfront_identified"]
+    ]
+
+    success_rate = warmfront_identified["gbis_eligible"].sum() / warmfront_identified.shape[0]
+    # For HA32, this is 89%
+
+    # missed = results_df[
+    #     results_df["warmfront_identified"] & (warmfront_identified["gbis_eligible"] != True)
+    #     ]
+
+    # to_check = missed[pd.isnull(missed["message"])]
+
+    # ha32[ha32["row_id"] == to_check["row_id"].values[14]].squeeze()
+    # to_check[to_check["row_id"] == to_check["row_id"].values[14]].squeeze()
+
+    # For these properties, warmfront identified all of them, however two did not seem to look valid.
+    # We could perhaps update our detection, if the properties not found are not currently EPC C or above, but
+    # do not look eligible from a building materials perspective
+    # E.g.:
+    # row_ids = ha32[ha32["Postcode"] == "HU4 6TG"]["row_id"].values
+    # z = results_df[results_df["row_id"].isin(row_ids)]
+
+    # Reason 1: The EPC indicates that the cavity is filled (GBIS allows for more than just cavity measures, however
+    #           we check just the cavity for GBIS homes, since I believe this is what Warmfront have in place with
+    #           regards to commercial agreements with the installer). An example of this is 30 Coxwold Grove,
+    #           HU4 6HH.
+    #
+    # Reason 2: Some properties do not have any existing data. This accounts for 16 of the 50 that we missed.
+    #           We will be implementing a solution to interpolate homes that do not have any data, based on their
+    #           neighbours. An example of this is 979 Hessle Road, HU4 6QG. If we look at the neighbours, we would
+    #           likely infer that this property has an empty cavity and therefore would identify it
+    #
+    # Reason 3: Some properties, e.g. 975 Hessle Road, HU4 6QG, look like they would qualify for GBIS,
+    #           but are already a C, based on the Nov 2022 EPC (it was a C before that too). I'm personally not sure
+    #           why this home would get identified as you would not be able to get GBIS funding. Same for 977 Hessle
+    #           Road. This was the most common reason. Another example is 8 Edith Cavell Court, HU5 4BA
+    #
+    # Reason 4: Some properties are a combination of reason 1 and 3. This could be to do with inaccurate EPCs as
+    #           empirically speaking, when going through this manually, it seemed like the ones that fall into this
+    #           category had slightly older EPCs (pre-2019). There are a few like this but e.g.
+    #           3, Summergroves Way HU4 6SZ
+
+    # We now look for properties that we identified, that were not identified by Warmfront
+
+    new_possibilities = results_df[
+        (~results_df["warmfront_identified"]) &
+        (results_df["gbis_eligible"] | results_df["eco4_eligible"])
+        ].copy()
+
+    # We deem that any EPC that was produced in the last 3 years gives us good confidence
+    cutoff_date = datetime.now() - timedelta(days=3 * 365)
+
+    new_possibilities["high_confidence"] = pd.to_datetime(new_possibilities["date_epc"]) >= cutoff_date
+
+    # We do a quick check on properties that didn't have a house number:
+    no_house_numbers_ha32 = ha32[ha32["row_id"].isin(no_house_numbers)]["identified"].sum()
+    if no_house_numbers_ha32:
+        logger.error("We have some identified properties that have no house numbers - investigate me")
+
+    return success_rate, new_possibilities
+
+
 def app():
     ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses = load_data()
 
@@ -422,72 +632,5 @@ def app():
     # We want to retrieve EPCs for every single property
     # NOTE: HA32 is MOSTLY cavity via GBIS
     ha_data = ha32
-    house_number_key = "Dwelling num"
-    address_key = "Street"
-    postcode_key = "Postcode"
 
-    def get_data(ha_data, house_number_key, address_key, postcode_key):
-
-        scoring_data = []
-        results = []
-        for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):
-            searcher = SearchEpc(
-                address1=" ".join([house[house_number_key], house[address_key]]),
-                postcode=house[postcode_key]
-            )
-
-            searcher.search()
-
-            newest_epc, older_epcs, _ = searcher.retrieve()
-            # We also want to get the penultimate epc
-            penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
-            if not penultimate_epc:
-                penultimate_epc = newest_epc
-
-            eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
-            eligibility.check_gbis()
-            eligibility.check_eco4()
-
-            # If there is no eligibility, we need to check the penultimate epc
-            if (not eligibility.eco4["eligible"]) and (not eligibility.gbis):
-                eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
-                eligibility.check_gbis()
-                eligibility.check_eco4()
-
-            if eligibility.eco4["eligible"]:
-                # TODO: Check me
-                scoring_dictionary = prepare_model_data_row(
-                    property_id=house["row_id"],
-                    modelling_epc=eligibility.epc,
-                    cleaned=cleaned,
-                    cleaning_data=cleaning_data,
-                    created_at=created_at
-                )
-                scoring_data.append(scoring_dictionary)
-                results.append(
-                    {
-                        "row_id": house["row_id"],
-                        "warmfront_identified": house["identified"],
-                        "gbis_eligible": eligibility.gbis,
-                        "eco4_eligible": eligibility.eco4["eligible"],
-                        "date_epc": eligibility.epc["lodgement-date"],
-                        "eco4_note": "conditional on post sap"
-                    }
-                )
-                continue
-
-            if (house["identified"] and not eligibility.gbis) and (
-                house["identified"] and not eligibility.eco4["eligible"]):
-                raise NotImplementedError("Investigate ms")
-
-            # If nothing is eligible or gbis is eligible, then we make a record this
-            results.append(
-                {
-                    "row_id": house["row_id"],
-                    "warmfront_identified": house["identified"],
-                    "gbis_eligible": eligibility.gbis,
-                    "eco4_eligible": eligibility.eco4["eligible"],
-                    "date_epc": eligibility.epc["lodgement-date"],
-                    "eco4_note": None
-                }
-            )
+    ha_32_results = get_ha_32data(ha_data, cleaned, cleaning_data, created_at)