diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9cadaf9f..e1d7db4d 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -20,6 +20,9 @@ from backend.ml_models.api import ModelApi from etl.solar.SolarPhotoSupply import SolarPhotoSupply from recommendations.recommendation_utils import calculate_cavity_age from etl.epc.Record import EPCRecord +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes +from etl.epc.DataProcessor import EPCDataProcessor +from datetime import datetime EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" @@ -5188,9 +5191,6 @@ def classify_loft(x): def fml_analysis(loader): - from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes - from etl.epc.DataProcessor import EPCDataProcessor - from datetime import datetime assumed_ciga_pass_rate = 0.731 has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16"] @@ -5216,15 +5216,20 @@ def fml_analysis(loader): bucket_name="retrofit-datalake-dev", s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle" ) + # We make sure we don't have duplicated. We do a super basic drop duplicates because it shouldn't be a huge + # issue at this point + epc_data = epc_data.drop_duplicates("uprn") # time from the inspection to now epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days if "estimated" not in epc_data.columns: - epc_data["estimated"] = None + # For all after HA7, we don't use estimated surveys + epc_data["estimated"] = False fuck_this = fml.merge( epc_data, how="left", on="asset_list_row_id" ) + fuck_this["estimated"] = fuck_this["estimated"].fillna(True) if fuck_this.shape[0] != fml.shape[0]: raise Exception("What the fuck bruv") @@ -5259,7 +5264,15 @@ def fml_analysis(loader): ) insulation_thicknesses = pd.DataFrame(insulation_thicknesses) + before_merge_shape = fuck_this.shape[0] fuck_this = fuck_this.merge(insulation_thicknesses, how="left", on="uprn") + + if fuck_this.shape[0] != before_merge_shape: + raise Exception("SOMETHING WENT WRONG") + + if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")): + blah + # clean roof insulation fuck_this["roof_insulation_thickness"] = fuck_this["roof_insulation_thickness"].fillna("0") fuck_this["roof_insulation_thickness"] = fuck_this[ @@ -5283,7 +5296,7 @@ def fml_analysis(loader): # # fuck_this['age_lower_bound'] = fuck_this['construction-age-band'].apply(extract_lower_bound) - had_survey = fuck_this[pd.isnull(fuck_this["estimated"])] + had_survey = fuck_this[fuck_this["estimated"] == False] # proportion with a survey: proportion_with_survey = 100 * had_survey.shape[0] / fuck_this.shape[0] @@ -5294,27 +5307,11 @@ def fml_analysis(loader): had_survey["ECO Eligibility"] == "eco4" ] - # Walls: - # Cavity wall, as built, insulated (assumed) - # Cavity wall, as built, no insulation (assumed) - # Cavity wall, as built, partial insulation (assumed) - - # Roof: - # Less than 100mm = high confidence - # Less than 270mm & EPC at least 5 years old = medium confidence - # Otherwise, low confidence - - # SAP criteria is EPC C or below - - # Pre is 54 or below - - no_ciga_check_needed_with_archetype = no_ciga_check_needed[ + no_ciga_check_needed_eligible = no_ciga_check_needed[ (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) & (no_ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] - if not no_ciga_check_needed_with_archetype.empty: - raise Exception("SORT ME OUT") # Characterise no CIGA check needed @@ -5327,9 +5324,20 @@ def fml_analysis(loader): ciga_check_passed = had_survey[ had_survey["ECO Eligibility"] == "eco4 - passed ciga" ] + # These should be treated the same as one that have passed their ciga checks, from a detection perspective + ciga_check_passed_eligible = ciga_check_passed[ + (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) & + (ciga_check_passed["roof_classiciation"].isin(["high", "medium"])) & + (ciga_check_passed["current-energy-efficiency"].astype(float) <= 80) + ] - if not ciga_check_passed.empty: - raise Exception("SORT ME BRUV") + if not loader.data[ha_name]["ciga_list"].empty: + + proportions = loader.data[ha_name]["ciga_list"]["Guarantee"].value_counts(normalize=True) + ha_ciga_pass_rate = proportions[proportions.index == "No"].values[0] + + else: + ha_ciga_pass_rate = assumed_ciga_pass_rate # We take just the cavity walls # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/ @@ -5338,53 +5346,96 @@ def fml_analysis(loader): # differ between variables; floor and wall type errors occur in ~10-15% of EPCs, # compared with ~5% for wall insulation and glazing performance - ciga_check_needed_plausible = ciga_check_needed[ + ciga_check_needed_eligible = ciga_check_needed[ (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & (ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] - if not loader.data[ha_name]["ciga_list"].empty: - raise NotImplementedError("SORT OUT THE CIGA BRUV") - else: - ha_ciga_pass_rate = assumed_ciga_pass_rate - - ciga_check_expectation = np.round(ciga_check_needed_plausible.shape[0] * ha_ciga_pass_rate) - without_ciga_expectation = no_ciga_check_needed_with_archetype.shape[0] + ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate) + without_ciga_expectation = no_ciga_check_needed_eligible.shape[0] + passed_ciga_expectation = ciga_check_passed_eligible.shape[0] # Need to add on the non-ciga - total_expectation = ciga_check_expectation + without_ciga_expectation + total_expectation = ciga_check_expectation + without_ciga_expectation + passed_ciga_expectation if proportion_with_survey < 100: # We estimate the rest without_survey_needing_ciga = fuck_this[ - (pd.isnull(fuck_this["estimated"]) == False) & + (fuck_this["estimated"] == True) & (fuck_this["ECO Eligibility"].str.contains("subject to ciga") == True) ] - # We apply the same conversion rate as the properties with a survey - without_survey_without_ciga_expected = np.round( - without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) - ) + if without_survey_needing_ciga.empty: + without_survey_without_ciga_expected = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_without_ciga_expected = np.round( + without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) + ) - total_expectation += without_survey_without_ciga_expected - - without_survey_without_ciga = fuck_this[ - (pd.isnull(fuck_this["estimated"]) == False) & (fuck_this["ECO Eligibility"].isin(["eco4"])) + without_survey_passed_ciga = fuck_this[ + (fuck_this["estimated"] == True) & + (fuck_this["ECO Eligibility"] == "eco4 - passed ciga") ] - if not without_survey_without_ciga.empty: - raise Exception("Estimate the rest!!") + if without_survey_passed_ciga.empty: + without_survey_passed_ciga_expected = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_passed_ciga_expected = np.round( + without_survey_passed_ciga.shape[0] * (passed_ciga_expectation / ciga_check_passed.shape[0]) + ) + + # Finally, no ciga needed + without_survey_eco4 = fuck_this[ + (fuck_this["estimated"] == True) & + (fuck_this["ECO Eligibility"] == "eco4") + ] + + if without_survey_eco4.empty: + without_survey_eco4_expected = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_eco4_expected = np.round( + without_survey_eco4.shape[0] * (without_ciga_expectation / no_ciga_check_needed.shape[0]) + ) + + total_expectation = ( + total_expectation + + without_survey_without_ciga_expected + + without_survey_passed_ciga_expected + + without_survey_eco4_expected + ) + + surveys = loader.data[ha_name]["survey_list"] + sold_now = 0 + if not surveys.empty: + sold_now = surveys[ + surveys["installation_status"].str.lower().str.contains("eco4") + ].shape[0] + + sales_since_nov = sold_now - original_figures["No. of Tech surveys complete - Eco 4"].values[0] results.append( { "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, + "Of which sold": sales_since_nov, + "Of which ECO4 Eligible - Remaining": int(total_expectation), "Proportion with a survey": proportion_with_survey, - "total_expectation": total_expectation } ) + results_df = pd.DataFrame(results) + + results_df["Delta vs November"] = 100 * ( + results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] + ) / results_df["Original ECO4 Estimate - Remaining"] + + # TODO: Split into high and low confidence? + # + def app(): """