From 6a327629bf0ab5284b1b951cc98360597f30ce1f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 12 Mar 2024 11:09:09 +0000
Subject: [PATCH] rough attempt to attribute surplus ciga dependent eco4 jobs

---
 .../ha_15_32/ha_analysis_batch_3.py           | 144 +++++++++++++-----
 1 file changed, 107 insertions(+), 37 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index d556450b..5ad1aa27 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -176,6 +176,10 @@ class DataLoader:
             "address": "Full Address",
             "postcode": "Postcode"
         },
+        "HA49": {
+            "address": "Property Address Full",
+            "postcode": "Property Postcode"
+        },
         "HA54": {
             "address": "Postal Address",
             "postcode": "matching_postcode"
@@ -219,7 +223,7 @@ class DataLoader:
 
     def create_asset_list_matching_address(self, ha_name, asset_list):
 
-        if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA54"]:
+        if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA49", "HA54"]:
             asset_list["matching_address"] = asset_list[
                 self.COLUMN_CONFIG[ha_name]["address"]
             ].astype(str).str.lower().str.strip()
@@ -382,6 +386,16 @@ class DataLoader:
                                              asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
                                              asset_list["PostCode"].astype(str).str.lower().str.strip()
             asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
+        elif ha_name == "HAXX":
+            asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["PostCode"].astype(str).str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
+        elif ha_name == "HAXXX":
+            asset_list["matching_address"] = (
+                asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " +
+                asset_list["Postcode"].astype(str).str.lower().str.strip()
+            )
+            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
         else:
             raise NotImplementedError("implement me")
 
@@ -467,6 +481,8 @@ class DataLoader:
             asset_list["HouseNo"] = asset_list["House_Number"].copy()
         elif ha_name == "HA9":
             asset_list["HouseNo"] = asset_list["House Number"].copy()
+        elif ha_name == "HAXXX":
+            asset_list["HouseNo"] = asset_list["Door Number"].copy()
         else:
             split_addresses = asset_list['matching_address'].str.split(',', expand=True)
             house_numbers = split_addresses[0].str.split(' ', expand=True)
@@ -1999,6 +2015,10 @@ class DataLoader:
 
         return survey_list
 
+    @staticmethod
+    def correct_ha49_survey_list(survey_list):
+        return survey_list  # pass-through: the survey list is returned unchanged
+
     @staticmethod
     def levenstein_match(matching_string, df):
         match_to = df["matching_address"].tolist()
@@ -5080,8 +5100,11 @@ def app():
     # Add in:
     priority_has = [
         "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25",
-        "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA54", "HA56", "HA63",
-        "HA107", "HA117"
+        "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56",
+        "HA63", "HA107", "HA117",
+
+        # New HAs
+        "HAXX", "HAXXX",
     ]
     # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
     # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
@@ -5100,39 +5123,86 @@ def app():
 
     forecast_remaining_sales(loader)
 
-    # We load in the additional data required to perform the analysis
-    # cleaned = read_from_s3(
-    #     s3_file_name="cleaned_epc_data/cleaned.bson",
-    #     bucket_name="retrofit-data-dev"
-    # )
-    # cleaned = msgpack.unpackb(cleaned, raw=False)
-    # cleaned = patch_cleaned(cleaned)
-    #
-    # cleaning_data = read_dataframe_from_s3_parquet(
-    #     bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
-    # )
-    # created_at = datetime.now().isoformat()
-    #
-    # photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
-    #
-    # outputs = get_epc_data(
-    #     loader=loader,
-    #     cleaned=cleaned,
-    #     cleaning_data=cleaning_data,
-    #     created_at=created_at,
-    #     photo_supply_lookup=photo_supply_lookup,
-    #     floor_area_decile_thresholds=floor_area_decile_thresholds,
-    #     pull_data=pull_data
-    # )
+    conversion_rate = 0.95
+    archetype_check_conversion = 0.7
+    res = []
+    for k, v in loader.data.items():
+        asset_list = v["asset_list"].copy()
+        agg = asset_list["ECO Eligibility"].value_counts()
+        # Skip HAs where no ECO eligibility category contains "passed" (i.e. no properties have passed CIGA)
+        if not any("passed" in x for x in agg.index):
+            continue
 
-    # import pickle
-    # with open("ha_analysis.pickle", "wb") as f:
-    #     pickle.dump({"outputs": outputs, "loader": loader}, f)
+        agg = pd.DataFrame(agg).reset_index()
 
-    # To read:
-    # import pickle
-    # with open("ha_analysis.pickle", "rb") as f:
-    #     outputs = pickle.load(f)["outputs"]
-    #
-    # with open("loader.pickle", "rb") as f:
-    #     loader = pickle.load(f)
+        passed_ciga = agg[agg["ECO Eligibility"] == "eco4 - passed ciga"]
+        passed_ciga = passed_ciga["count"].values[0] if not passed_ciga.empty else 0
+
+        failed_ciga = agg[agg["ECO Eligibility"] == "failed ciga"]
+        failed_ciga = failed_ciga["count"].values[0] if not failed_ciga.empty else 0
+
+        ciga_pass_rate = passed_ciga / (passed_ciga + failed_ciga) if (passed_ciga + failed_ciga) > 0 else 1
+
+        dormant_ciga = agg[
+            agg["ECO Eligibility"].str.contains("subject to ciga") &
+            ~agg["ECO Eligibility"].str.contains("subject to archetype")
+            ]
+
+        dormant_ciga = dormant_ciga['count'].values[0] if not dormant_ciga.empty else 0
+
+        dormant_ciga_archetype = agg[
+            agg["ECO Eligibility"].str.contains("subject to ciga") &
+            agg["ECO Eligibility"].str.contains("subject to archetype")
+            ]
+
+        dormant_ciga_archetype = dormant_ciga_archetype['count'].values[0] if not dormant_ciga_archetype.empty else 0
+
+        needing_check = dormant_ciga + dormant_ciga_archetype * archetype_check_conversion
+        needing_check = np.round(needing_check)
+
+        additional_jobs = (dormant_ciga * ciga_pass_rate * conversion_rate) + (
+            dormant_ciga_archetype * archetype_check_conversion * ciga_pass_rate * conversion_rate
+        )
+        additional_jobs = np.round(additional_jobs)
+
+        # We attempt to estimate the uplift and how much of that is attributed to surplus subject to ciga jobs
+        original_estimate = loader.december_figures[
+            loader.december_figures["HA Name"] == k
+            ]
+
+        original_estimate = original_estimate["ECO4"].values[0] if not original_estimate.empty else 0
+        base_eco_figures = agg[
+            agg["ECO Eligibility"].isin(["eco4", "eco4 - passed ciga"])
+        ]["count"].sum()
+        eco4_from_ciga = original_estimate - base_eco_figures
+        eco4_from_ciga = eco4_from_ciga if eco4_from_ciga > 0 else 0
+        surplus_from_dormant = additional_jobs - eco4_from_ciga
+        surplus_from_dormant = 0 if surplus_from_dormant < 0 else surplus_from_dormant
+
+        res.append(
+            {
+                "ha_name": k,
+                "additional_eco4": additional_jobs,
+                "needing_check": needing_check,
+                "surplus_from_dormant": surplus_from_dormant
+            }
+        )
+
+    res = pd.DataFrame(res)
+    # Drop the HAs that are not in the previous draft
+    # In the v2 draft, there are 12 HAs
+
+    v5_surplus = res[
+        ~res["ha_name"].isin(["HA9"])
+    ]["additional_eco4"].sum()
+    # 7212 properties
+    # This is not a perfect difference though, because of variations in how the numbers are recorded in the November
+    # all-HAs sheet. E.g. for HA107, 1239 properties were identified. In the postcode list there are 1255, of which
+    # 531 still need a CIGA check. Therefore the original figures, in this case, included properties that were
+    # pre-CIGA.
+
+    v5_surplus_from_dormant = res[
+        ~res["ha_name"].isin(["HA9"])
+    ]["surplus_from_dormant"].sum()
+    # 5539.0
+    # 9471690