From 6a327629bf0ab5284b1b951cc98360597f30ce1f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Mar 2024 11:09:09 +0000 Subject: [PATCH] rough attempt to attribute surplus ciga dependent eco4 jobs --- .../ha_15_32/ha_analysis_batch_3.py | 144 +++++++++++++----- 1 file changed, 107 insertions(+), 37 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d556450b..5ad1aa27 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -176,6 +176,10 @@ class DataLoader: "address": "Full Address", "postcode": "Postcode" }, + "HA49": { + "address": "Property Address Full", + "postcode": "Property Postcode" + }, "HA54": { "address": "Postal Address", "postcode": "matching_postcode" @@ -219,7 +223,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA54"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA49", "HA54"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -382,6 +386,16 @@ class DataLoader: asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \ asset_list["PostCode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip() + elif ha_name == "HAXX": + asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["PostCode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip() + elif ha_name == "HAXXX": + asset_list["matching_address"] = ( + asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + 
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() else: raise NotImplementedError("implement me") @@ -467,6 +481,8 @@ class DataLoader: asset_list["HouseNo"] = asset_list["House_Number"].copy() elif ha_name == "HA9": asset_list["HouseNo"] = asset_list["House Number"].copy() + elif ha_name == "HAXXX": + asset_list["HouseNo"] = asset_list["Door Number"].copy() else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) @@ -1999,6 +2015,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha49_survey_list(survey_list): + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -5080,8 +5100,11 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", - "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA54", "HA56", "HA63", - "HA107", "HA117" + "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", + "HA63", "HA107", "HA117", + + # New HAS + "HAXX", "HAXXX", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], @@ -5100,39 +5123,86 @@ def app(): forecast_remaining_sales(loader) - # We load in the additional data required to perform the analysis - # cleaned = read_from_s3( - # s3_file_name="cleaned_epc_data/cleaned.bson", - # bucket_name="retrofit-data-dev" - # ) - # cleaned = msgpack.unpackb(cleaned, raw=False) - # cleaned = patch_cleaned(cleaned) - # - # cleaning_data = read_dataframe_from_s3_parquet( - # bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", - # ) - 
# created_at = datetime.now().isoformat() - # - # photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") - # - # outputs = get_epc_data( - # loader=loader, - # cleaned=cleaned, - # cleaning_data=cleaning_data, - # created_at=created_at, - # photo_supply_lookup=photo_supply_lookup, - # floor_area_decile_thresholds=floor_area_decile_thresholds, - # pull_data=pull_data - # ) + conversion_rate = 0.95 + archetype_check_conversion = 0.7 + res = [] + for k, v in loader.data.items(): + asset_list = v["asset_list"].copy() + agg = asset_list["ECO Eligibility"].value_counts() + # We find a case where there are properties that have passed CIGA + if not any("passed" in x for x in agg.index): + continue - # import pickle - # with open("ha_analysis.pickle", "wb") as f: - # pickle.dump({"outputs": outputs, "loader": loader}, f) + agg = pd.DataFrame(agg).reset_index() - # To read: - # import pickle - # with open("ha_analysis.pickle", "rb") as f: - # outputs = pickle.load(f)["outputs"] - # - # with open("loader.pickle", "rb") as f: - # loader = pickle.load(f) + passed_ciga = agg[agg["ECO Eligibility"] == "eco4 - passed ciga"] + passed_ciga = passed_ciga["count"].values[0] if not passed_ciga.empty else 0 + + failed_ciga = agg[agg["ECO Eligibility"] == "failed ciga"] + failed_ciga = failed_ciga["count"].values[0] if not failed_ciga.empty else 0 + + ciga_pass_rate = passed_ciga / (passed_ciga + failed_ciga) if (passed_ciga + failed_ciga) > 0 else 1 + + dormant_ciga = agg[ + agg["ECO Eligibility"].str.contains("subject to ciga") & + ~agg["ECO Eligibility"].str.contains("subject to archetype") + ] + + dormant_ciga = dormant_ciga['count'].values[0] if not dormant_ciga.empty else 0 + + dormant_ciga_archetype = agg[ + agg["ECO Eligibility"].str.contains("subject to ciga") & + agg["ECO Eligibility"].str.contains("subject to archetype") + ] + + dormant_ciga_archetype = dormant_ciga_archetype['count'].values[0] if not dormant_ciga_archetype.empty 
else 0 + + needing_check = dormant_ciga + dormant_ciga_archetype * archetype_check_conversion + needing_check = np.round(needing_check) + + additional_jobs = (dormant_ciga * ciga_pass_rate * conversion_rate) + ( + dormant_ciga_archetype * archetype_check_conversion * ciga_pass_rate * conversion_rate + ) + additional_jobs = np.round(additional_jobs) + + # We attempt to estimate the uplift and how much of that is attributed to surplus subject to ciga jobs + original_estimate = loader.december_figures[ + loader.december_figures["HA Name"] == k + ] + + original_estimate = original_estimate["ECO4"].values[0] if not original_estimate.empty else 0 + base_eco_figures = agg[ + agg["ECO Eligibility"].isin(["eco4", "eco4 - passed ciga"]) + ]["count"].sum() + eco4_from_ciga = original_estimate - base_eco_figures + eco4_from_ciga = eco4_from_ciga if eco4_from_ciga > 0 else 0 + surplus_from_dormant = additional_jobs - eco4_from_ciga + surplus_from_dormant = 0 if surplus_from_dormant < 0 else surplus_from_dormant + + res.append( + { + "ha_name": k, + "additional_eco4": additional_jobs, + "needing_check": needing_check, + "surplus_from_dormant": surplus_from_dormant + } + ) + + res = pd.DataFrame(res) + # Drop the HAs that are not in the previous draft + # In the v2 draft, there are 12 HAs + + v5_surplus = res[ + ~res["ha_name"].isin(["HA9"]) + ]["additional_eco4"].sum() + # 7212 properties + # This is not a perfect difference though, because of the variations in how the numbers are recorded in the November + # all HAs sheet. E.g. for HA 107, there were 1239 properties identified. In the postcode list, there are 1255, + # however 531 are still needing a CIGA check. Therefore their original figures, in this case, included properties + # pre-CIGA + + v5_surplus_from_dormant = res[ + ~res["ha_name"].isin(["HA9"]) + ]["surplus_from_dormant"].sum() + # 5539.0 + # 9471690