Rough attempt to attribute surplus CIGA-dependent ECO4 jobs

2026-06-08 11:17:27 +00:00 · 2024-03-12 11:09:09 +00:00 · 2024-03-12 11:09:09 +00:00 · 6a327629bf
commit 6a327629bf
parent 41c17aa1da
1 changed files with 107 additions and 37 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -176,6 +176,10 @@ class DataLoader:
            "address": "Full Address",
            "postcode": "Postcode"
        },
+        "HA49": {
+            "address": "Property Address Full",
+            "postcode": "Property Postcode"
+        },
        "HA54": {
            "address": "Postal Address",
            "postcode": "matching_postcode"
@ -219,7 +223,7 @@ class DataLoader:

    def create_asset_list_matching_address(self, ha_name, asset_list):

-        if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA54"]:
+        if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA49", "HA54"]:
            asset_list["matching_address"] = asset_list[
                self.COLUMN_CONFIG[ha_name]["address"]
            ].astype(str).str.lower().str.strip()
@ -382,6 +386,16 @@ class DataLoader:
                                             asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
                                             asset_list["PostCode"].astype(str).str.lower().str.strip()
            asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
+        elif ha_name == "HAXX":
+            asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
+                                             asset_list["PostCode"].astype(str).str.lower().str.strip()
+            asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
+        elif ha_name == "HAXXX":
+            asset_list["matching_address"] = (
+                asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " +
+                asset_list["Postcode"].astype(str).str.lower().str.strip()
+            )
+            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
        else:
            raise NotImplementedError("implement me")

@ -467,6 +481,8 @@ class DataLoader:
            asset_list["HouseNo"] = asset_list["House_Number"].copy()
        elif ha_name == "HA9":
            asset_list["HouseNo"] = asset_list["House Number"].copy()
+        elif ha_name == "HAXXX":
+            asset_list["HouseNo"] = asset_list["Door Number"].copy()
        else:
            split_addresses = asset_list['matching_address'].str.split(',', expand=True)
            house_numbers = split_addresses[0].str.split(' ', expand=True)
@ -1999,6 +2015,10 @@ class DataLoader:

        return survey_list

+    @staticmethod
+    def correct_ha49_survey_list(survey_list):
+        return survey_list
+
    @staticmethod
    def levenstein_match(matching_string, df):
        match_to = df["matching_address"].tolist()
@ -5080,8 +5100,11 @@ def app():
    # Add in:
    priority_has = [
        "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25",
-        "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA54", "HA56", "HA63",
-        "HA107", "HA117"
+        "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56",
+        "HA63", "HA107", "HA117",
+
+        # New HAs
+        "HAXX", "HAXXX",
    ]
    # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
    # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
@ -5100,39 +5123,86 @@ def app():

    forecast_remaining_sales(loader)

-    # We load in the additional data required to perform the analysis
-    # cleaned = read_from_s3(
-    #     s3_file_name="cleaned_epc_data/cleaned.bson",
-    #     bucket_name="retrofit-data-dev"
-    # )
-    # cleaned = msgpack.unpackb(cleaned, raw=False)
-    # cleaned = patch_cleaned(cleaned)
-    #
-    # cleaning_data = read_dataframe_from_s3_parquet(
-    #     bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
-    # )
-    # created_at = datetime.now().isoformat()
-    #
-    # photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
-    #
-    # outputs = get_epc_data(
-    #     loader=loader,
-    #     cleaned=cleaned,
-    #     cleaning_data=cleaning_data,
-    #     created_at=created_at,
-    #     photo_supply_lookup=photo_supply_lookup,
-    #     floor_area_decile_thresholds=floor_area_decile_thresholds,
-    #     pull_data=pull_data
-    # )
+    conversion_rate = 0.95
+    archetype_check_conversion = 0.7
+    res = []
+    for k, v in loader.data.items():
+        asset_list = v["asset_list"].copy()
+        agg = asset_list["ECO Eligibility"].value_counts()
+        # Only consider HAs where at least one property has passed CIGA; skip the rest
+        if not any("passed" in x for x in agg.index):
+            continue

-    # import pickle
-    # with open("ha_analysis.pickle", "wb") as f:
-    #     pickle.dump({"outputs": outputs, "loader": loader}, f)
+        agg = pd.DataFrame(agg).reset_index()

-    # To read:
-    # import pickle
-    # with open("ha_analysis.pickle", "rb") as f:
-    #     outputs = pickle.load(f)["outputs"]
-    #
-    # with open("loader.pickle", "rb") as f:
-    #     loader = pickle.load(f)
+        passed_ciga = agg[agg["ECO Eligibility"] == "eco4 - passed ciga"]
+        passed_ciga = passed_ciga["count"].values[0] if not passed_ciga.empty else 0
+
+        failed_ciga = agg[agg["ECO Eligibility"] == "failed ciga"]
+        failed_ciga = failed_ciga["count"].values[0] if not failed_ciga.empty else 0
+
+        ciga_pass_rate = passed_ciga / (passed_ciga + failed_ciga) if (passed_ciga + failed_ciga) > 0 else 1
+
+        dormant_ciga = agg[
+            agg["ECO Eligibility"].str.contains("subject to ciga") &
+            ~agg["ECO Eligibility"].str.contains("subject to archetype")
+            ]
+
+        dormant_ciga = dormant_ciga['count'].values[0] if not dormant_ciga.empty else 0
+
+        dormant_ciga_archetype = agg[
+            agg["ECO Eligibility"].str.contains("subject to ciga") &
+            agg["ECO Eligibility"].str.contains("subject to archetype")
+            ]
+
+        dormant_ciga_archetype = dormant_ciga_archetype['count'].values[0] if not dormant_ciga_archetype.empty else 0
+
+        needing_check = dormant_ciga + dormant_ciga_archetype * archetype_check_conversion
+        needing_check = np.round(needing_check)
+
+        additional_jobs = (dormant_ciga * ciga_pass_rate * conversion_rate) + (
+            dormant_ciga_archetype * archetype_check_conversion * ciga_pass_rate * conversion_rate
+        )
+        additional_jobs = np.round(additional_jobs)
+
+        # We attempt to estimate the uplift and how much of it is attributable to surplus "subject to ciga" jobs
+        original_estimate = loader.december_figures[
+            loader.december_figures["HA Name"] == k
+            ]
+
+        original_estimate = original_estimate["ECO4"].values[0] if not original_estimate.empty else 0
+        base_eco_figures = agg[
+            agg["ECO Eligibility"].isin(["eco4", "eco4 - passed ciga"])
+        ]["count"].sum()
+        eco4_from_ciga = original_estimate - base_eco_figures
+        eco4_from_ciga = eco4_from_ciga if eco4_from_ciga > 0 else 0
+        surplus_from_dormant = additional_jobs - eco4_from_ciga
+        surplus_from_dormant = 0 if surplus_from_dormant < 0 else surplus_from_dormant
+
+        res.append(
+            {
+                "ha_name": k,
+                "additional_eco4": additional_jobs,
+                "needing_check": needing_check,
+                "surplus_from_dormant": surplus_from_dormant
+            }
+        )
+
+    res = pd.DataFrame(res)
+    # Drop the HAs that are not in the previous draft
+    # In the v2 draft, there are 12 HAs
+
+    v5_surplus = res[
+        ~res["ha_name"].isin(["HA9"])
+    ]["additional_eco4"].sum()
+    # 7212 properties
+    # This is not a perfect difference though, because of the variations in how the numbers are recorded in the November
+    # all-HAs sheet. E.g. for HA107, there were 1239 properties identified. In the postcode list there are 1255,
+    # however 531 still need a CIGA check. Therefore their original figures, in this case, included properties
+    # pre-CIGA.
+
+    v5_surplus_from_dormant = res[
+        ~res["ha_name"].isin(["HA9"])
+    ]["surplus_from_dormant"].sum()
+    # 5539.0
+    # 9471690