From b6c57c7253ec86b59ef1599489a405a9466ce505 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 26 Jan 2024 17:17:43 +0000
Subject: [PATCH] created template of code to create the ha analysis results

---
 etl/eligibility/Eligibility.py                |   6 +-
 .../ha_15_32/ha_analysis_batch_3.py           | 242 +++++++++++++++---
 2 files changed, 207 insertions(+), 41 deletions(-)

diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py
index 00c72a8e..1d868338 100644
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@@ -177,15 +177,13 @@ class Eligibility:
         is_empty = (not self.walls["is_filled_cavity"]) or (
             self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["average", "above average"]
         )
-        is_partial_filled = (
-            self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["below average"]
-        )
+        is_partial_filled = "partial" in self.walls["clean_description"].lower()
         # We look for potentially under performing cavities - anything that is assumed, as built and insulated
         is_underperforming = (
             self.walls["is_as_built"] and self.walls["insulation_thickness"] in ["average"] and self.walls["is_assumed"]
         )
 
-        is_unfilled_cavity = is_cavity and is_empty
+        is_unfilled_cavity = is_cavity and (is_empty and not is_partial_filled)
         is_partial_filled_cavity = is_cavity and is_partial_filled
         is_underperforming_cavity = is_cavity and is_underperforming
 
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 1212522e..1ed95a30 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -901,6 +901,7 @@ def analyse_ha_data(outputs, loader):
     :return:
     """
 
+    ha_analysis_results = []
     for ha_name, datasets in outputs.items():
 
         inputs = [x for k, x in loader.data.items() if k == ha_name][0]
@@ -917,9 +918,20 @@ def analyse_ha_data(outputs, loader):
             inputs["asset_list"]["funding_scheme"]
         )
 
+        # TODO: Also temp, just for HA 6
+        if ha_name == "ha_6":
+            inputs["survey_list"]["funding_scheme"] = None
+            inputs["survey_list"]["funding_scheme"] = np.where(
+                inputs["survey_list"][
+                    'AFFORDABLE WARMTH                 OR EPC FOR HOUSING ASSOCIATION '] == "AFFORDABLE WARMTH",
+                "ECO4",
+                "GBIS"
+            )
+
         # End placholder
 
         results_df = datasets["results_df"].copy()
+
         analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
             columns={"row_meaning": "asset_identification_status"}
         ).merge(
@@ -929,23 +941,6 @@ def analyse_ha_data(outputs, loader):
             left_on="asset_list_row_id"
         )
 
-        # If we have a survey list, we merge this onto the results
-
-        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
-
-        properties_sold = (
-            inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
-            inputs["survey_list"] is not None else 0
-        )
-        properties_sold_eco4 = (
-            properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
-            properties_sold != 0 else 0
-        )
-        properties_sold_gbis = (
-            properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
-            properties_sold != 0 else 0
-        )
-
         # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is
         # remaining
 
@@ -956,8 +951,23 @@ def analyse_ha_data(outputs, loader):
             # Drop any rows that have a survey_list_row_id
             analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
 
+        # If we have a survey list, we merge this onto the results
+        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
+
+        properties_sold = (
+            inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
+            inputs["survey_list"] is not None else pd.DataFrame(columns=["funding_scheme"])
+        )
+        properties_sold_eco4 = (
+            properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
+            (not properties_sold.empty) and ("ECO4" in properties_sold["funding_scheme"].values) else 0
+        )
+        properties_sold_gbis = (
+            properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
+            (not properties_sold.empty) and ("GBIS" in properties_sold["funding_scheme"].values) else 0
+        )
+
         # We now calculate the number of remaining properties, by scheme
-        # TODO: We might need to tweak a bit of the logic
         remaining_properties = analysis_data[
             analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
             ].copy()
@@ -966,6 +976,7 @@ def analyse_ha_data(outputs, loader):
         remaining_properties_by_scheme = (
             remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
         )
+
         n_remaining_properties_eco4 = remaining_properties_by_scheme[
             remaining_properties_by_scheme["funding_scheme"] == "ECO4"
             ]["asset_list_row_id"].values[0]
@@ -983,13 +994,17 @@ def analyse_ha_data(outputs, loader):
         #    - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
         #      here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have
         #      very old EPCs which may score lower when re-done
-        # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
+        # 2) Meets Fabric requirements, not SAP
+        #    Warmfront has identified the property as eligible, but the EPC is not D or below. We consider this but
+        #    label it separately, as it is not a strict ECO4 candidate
+        # 3) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
         #    - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
         #      actually look like after retrofit and so the EPC currently being a C or above means little, because
         #      the updated EPC, showing an empty cavity, could bring the property within
-        # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
+        # 4) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
         #   - No SAP constraint, for the same reason as in category 2)
-        # 4) Does not look like ECO4 candidate
+        # 5) Looks like GBIS instead
+        # 6) Does not look like ECO4 candidate
         #
         # For properties that have been identified as GBIS
         # 1) Strict GBIS candidates
@@ -1000,43 +1015,156 @@ def analyse_ha_data(outputs, loader):
         remaining_eco4_df = remaining_properties[
             remaining_properties["funding_scheme"] == "ECO4"
             ].copy()
+
+        ####################################
         # ECO4
+        ####################################
+
         # 1) We identify this if:
         #   - remaining_properties["eco4_eligible"] == True
 
         remaining_eco4_df["prospect_type"] = np.where(
-            remaining_eco4_df["eco4_eligible"] == True,
+            (remaining_eco4_df["eco4_eligible"] == True),
             "strict ECO4",
             remaining_eco4_df["prospect_type"]
         )
 
-        # 2) We identify this if it has a filled cavity but meets the loft conditions
+        # 2) Meets fabric requirements
+        remaining_eco4_df["prospect_type"] = np.where(
+            (
+                (remaining_eco4_df["eco4_message"] == "sap too high") &
+                remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) &
+                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
+            ),
+            "ECO4 if SAP downgrade",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        remaining_eco4_df["prospect_type"]
+        # 3) We identify this if it has a filled cavity but meets the loft conditions
+        # TODO: Consider if we should also allow 100-270mm or if we should add some slight tolerance (e.g. 150mm)
+        #       to account for measurement error
+        remaining_eco4_df["prospect_type"] = np.where(
+            (
+                remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) &
+                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
+            ),
+            "Filled cavity - subject to CIGA check",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        z = remaining_eco4_df[remaining_eco4_df["eco4_message"] == "sap too high"]
+        # 4) We identify this by ensuring the cavity is empty or partial, and the loft has between 101 and 270mm
+        remaining_eco4_df["prospect_type"] = np.where(
+            (
+                remaining_eco4_df["eligibility_cavity_type"].isin(["empty", "partial"]) &
+                remaining_eco4_df["eligibility_loft_type"].isin(["100-270mm"])
+            ),
+            "ECO4 prospect - empty cavity, loft insulation below regulation",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
-        remaining_properties["eco4_message"].value_counts()
-        z = remaining_properties[
-            (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") &
-            (remaining_properties["eco4_eligible"] == True)
-            ]
+        # 5) Looks like GBIS instead
+        remaining_eco4_df["prospect_type"] = np.where(
+            (remaining_eco4_df["gbis_eligible"] == True),
+            "Looks like GBIS",
+            remaining_eco4_df["prospect_type"]
+        )
 
-        k = z[z["property_type"] == "Flat"]
-        k["uprn"]
+        # 6) This is everything else (i.e. both the cavity is full and the loft insulation is above 100mm)
+        remaining_eco4_df["prospect_type"] = remaining_eco4_df["prospect_type"].fillna(
+            "Does not look like ECO4 candidate"
+        )
 
-        ha_analysis_results = {
+        ####################################
+        # GBIS
+        ####################################
+
+        remaining_gbis = remaining_properties[
+            remaining_properties["funding_scheme"] == "GBIS"
+            ].copy()
+
+        # 1) Strict GBIS candidates
+        remaining_gbis["prospect_type"] = np.where(
+            (
+                (remaining_gbis["gbis_eligible"] == True) & (remaining_gbis["eco4_eligible"] == False)
+            ),
+            "strict GBIS",
+            remaining_gbis["prospect_type"]
+        )
+
+        # 2) GBIS candidates that look like strict ECO4 candidates
+        remaining_gbis["prospect_type"] = np.where(
+            (remaining_gbis["eco4_eligible"] == True),
+            "Upgradable to ECO4",
+            remaining_gbis["prospect_type"]
+        )
+
+        # 3) Subject to CIGA check - Filled cavity
+        remaining_gbis["prospect_type"] = np.where(
+            (
+                remaining_gbis["eligibility_cavity_type"].isin(["full"])
+            ),
+            "Filled cavity - subject to CIGA check",
+            remaining_gbis["prospect_type"]
+        )
+
+        # 4) Everything else
+        remaining_gbis["prospect_type"] = remaining_gbis["prospect_type"].fillna(
+            "Does not look like GBIS candidate"
+        )
+
+        ####################################
+        # Surplus properties
+        ####################################
+
+        # Take properties that were not identified by Warmfront and identify those that look like they would qualify
+        # under the strictest criteria
+        surplus_df = analysis_data[
+            analysis_data["asset_identification_status"] != "identified potential eco works (CWI)"
+            ].copy()
+
+        eco4_surplus = surplus_df[
+            (
+                (surplus_df["eco4_eligible"] == True) & (surplus_df["eco4_message"] == "subject to post retrofit sap") &
+                (
+                    surplus_df["eligibility_classification"].isin(
+                        ["high confidence", "highest confidence", "medium confidence"]
+                    )
+                )
+            )
+        ].copy()
+
+        gbis_surplus = surplus_df[
+            (
+                (surplus_df["gbis_eligible"] == True) & (surplus_df["eco4_eligible"] == False) & (
+                surplus_df["eligibility_cavity_type"].isin(["empty", "partial"])
+            )
+            )
+        ].copy()
+
+        ha_analysis_results.append({
             "n_properties_in_asset_list": n_properties_in_asset_list,
+            ############
             # ECO4
+            ############
             "properties_sold_eco4": properties_sold_eco4,
             "n_remaining_properties_eco4": n_remaining_properties_eco4,
+            **remaining_eco4_df["prospect_type"].value_counts().to_dict(),
+            ############
             # GBIS
+            ############
             "properties_sold_gbis": properties_sold_gbis,
-            "n_remaining_properties_gbis": n_remaining_properties_gbis
-        }
+            "n_remaining_properties_gbis": n_remaining_properties_gbis,
+            **remaining_gbis["prospect_type"].value_counts().to_dict(),
+            ############
+            # Surplus
+            ############
+            "n_eco4_surplus": eco4_surplus.shape[0],
+            "n_gbis_surplus": gbis_surplus.shape[0],
+        })
 
-    pass
+    ha_analysis_results = pd.DataFrame(ha_analysis_results)
+
+    # TODO: create revenue figures and automate creation of the Excel output
 
 
 def app():
@@ -1152,3 +1280,43 @@ def app():
     outputs = get_epc_data(
         loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False
     )
+
+    # for ha_name, datasets in outputs.items():
+    #     datasets["results_df"] = datasets["results_df"].drop(
+    #         columns=["eligibility_cavity_type", "eligibility_loft_type"]
+    #     )
+    #
+    #     # Re-do
+    #     res = []
+    #     for _, row in tqdm(datasets["results_df"].iterrows(), total=datasets["results_df"].shape[0]):
+    #         epc = {
+    #             "walls-description": row["walls"],
+    #             "roof-description": row["roof"],
+    #             "floor-description": "",
+    #             "tenure": "",
+    #             "current-energy-efficiency": row["sap"],
+    #         }
+    #         eligibility = Eligibility(epc=epc, cleaned=cleaned)
+    #         eligibility.check_eco4_warmfront()
+    #         res.append(
+    #             {
+    #                 "row_id": row["row_id"],
+    #                 "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"],
+    #                 "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"]
+    #             }
+    #         )
+    #
+    #     # Merge back on
+    #     res = pd.DataFrame(res)
+    #     datasets["results_df"] = datasets["results_df"].merge(res, how="left", on="row_id")
+    #
+    #     # Re-save in s3
+    #     save_pickle_to_s3(
+    #         data={
+    #             "results_df": datasets["results_df"],
+    #             "scoring_df": datasets["scoring_df"],
+    #             "nodata": datasets["nodata"]
+    #         },
+    #         bucket_name="retrofit-datalake-dev",
+    #         s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
+    #     )