From b6c57c7253ec86b59ef1599489a405a9466ce505 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 26 Jan 2024 17:17:43 +0000 Subject: [PATCH] created template of code to create the ha analysis results --- etl/eligibility/Eligibility.py | 6 +- .../ha_15_32/ha_analysis_batch_3.py | 242 +++++++++++++++--- 2 files changed, 207 insertions(+), 41 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 00c72a8e..1d868338 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -177,15 +177,13 @@ class Eligibility: is_empty = (not self.walls["is_filled_cavity"]) or ( self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["average", "above average"] ) - is_partial_filled = ( - self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["below average"] - ) + is_partial_filled = "partial" in self.walls["clean_description"].lower() # We look for potentially under performing cavities - anything that is assumed, as built and insulated is_underperforming = ( self.walls["is_as_built"] and self.walls["insulation_thickness"] in ["average"] and self.walls["is_assumed"] ) - is_unfilled_cavity = is_cavity and is_empty + is_unfilled_cavity = is_cavity and (is_empty and not is_partial_filled) is_partial_filled_cavity = is_cavity and is_partial_filled is_underperforming_cavity = is_cavity and is_underperforming diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1212522e..1ed95a30 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -901,6 +901,7 @@ def analyse_ha_data(outputs, loader): :return: """ + ha_analysis_results = [] for ha_name, datasets in outputs.items(): inputs = [x for k, x in loader.data.items() if k == ha_name][0] @@ -917,9 +918,20 @@ def analyse_ha_data(outputs, loader): inputs["asset_list"]["funding_scheme"] ) + # TODO: Also temp, 
just for HA 6 + if ha_name == "ha_6": + inputs["survey_list"]["funding_scheme"] = None + inputs["survey_list"]["funding_scheme"] = np.where( + inputs["survey_list"][ + 'AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION '] == "AFFORDABLE WARMTH", + "ECO4", + "GBIS" + ) + # End placeholder results_df = datasets["results_df"].copy() + analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename( columns={"row_meaning": "asset_identification_status"} ).merge( @@ -929,23 +941,6 @@ def analyse_ha_data(outputs, loader): left_on="asset_list_row_id" ) - # If we have a survey list, we merge this onto the results - - n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique() - - properties_sold = ( - inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if - inputs["survey_list"] is not None else 0 - ) - properties_sold_eco4 = ( - properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if - properties_sold != 0 else 0 - ) - properties_sold_gbis = ( - properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if - properties_sold != 0 else 0 - ) - # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is # remaining @@ -956,8 +951,23 @@ def analyse_ha_data(outputs, loader): # Drop any rows that have a survey_list_row_id analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])] + # If we have a survey list, we merge this onto the results + n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique() + + properties_sold = ( + inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if + inputs["survey_list"] is not None else pd.DataFrame(columns=["funding_scheme"]) + ) + properties_sold_eco4 = ( + properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if + 
(not properties_sold.empty) and ("ECO4" in properties_sold["funding_scheme"].values) else 0 + ) + properties_sold_gbis = ( + properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if + (not properties_sold.empty) and ("GBIS" in properties_sold["funding_scheme"].values) else 0 + ) + # We now calculate the number of remaining properties, by scheme - # TODO: We might need to tweak a bit of the logic remaining_properties = analysis_data[ analysis_data["asset_identification_status"] == "identified potential eco works (CWI)" ].copy() @@ -966,6 +976,7 @@ def analyse_ha_data(outputs, loader): remaining_properties_by_scheme = ( remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index() ) + n_remaining_properties_eco4 = remaining_properties_by_scheme[ remaining_properties_by_scheme["funding_scheme"] == "ECO4" ]["asset_list_row_id"].values[0] @@ -983,13 +994,17 @@ def analyse_ha_data(outputs, loader): # - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties # here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have # very old EPCs which may score lower when re-done - # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity. + # 2) Meets Fabric requirements, not SAP + # Warmfront has identified the property as eligible, but the EPC is not D or below. We consider this but + # label it separately as not a strict ECO4 candidate + # 3) Subject to CIGA check - Meets loft conditions but shows a filled cavity. # - we don't have a SAP constraint here because the EPC is (currently) showing what the property might # actually look like after retrofit and so the EPC currently being a C or above means little, because # the updated EPC, showing an empty cavity, could bring the property within - # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation. 
+ # 4) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation. # - No SAP constraint, for the same reason as in category 2) - # 4) Does not look like ECO4 candidate + # 5) Looks like GBIS instead + # 6) Does not look like ECO4 candidate # # For properties that have been identified as GBIS # 1) Strict GBIS candidates @@ -1000,43 +1015,156 @@ def analyse_ha_data(outputs, loader): remaining_eco4_df = remaining_properties[ remaining_properties["funding_scheme"] == "ECO4" ].copy() + + #################################### # ECO4 + #################################### + # 1) We identify this if: # - remaining_properties["eco4_eligible"] == True remaining_eco4_df["prospect_type"] = np.where( - remaining_eco4_df["eco4_eligible"] == True, + (remaining_eco4_df["eco4_eligible"] == True), "strict ECO4", remaining_eco4_df["prospect_type"] ) - # 2) We identify this if it has a filled cavity but meets the loft conditions + # 2) Meets fabric requirements + remaining_eco4_df["prospect_type"] = np.where( + ( + (remaining_eco4_df["eco4_message"] == "sap too high") & + remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) & + remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) + ), + "ECO4 if SAP downgrade", + remaining_eco4_df["prospect_type"] + ) - remaining_eco4_df["prospect_type"] + # 3) We identify this if it has a filled cavity but meets the loft conditions + # TODO: Consider if we should also allow 100-270mm or if we should add some slight tolerance (e.g. 
150mm) + # to account for measurement error + remaining_eco4_df["prospect_type"] = np.where( + ( + remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) & + remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) + ), + "Filled cavity - subject to CIGA check", + remaining_eco4_df["prospect_type"] + ) - z = remaining_eco4_df[remaining_eco4_df["eco4_message"] == "sap too high"] + # 4) We identify this by ensuring the cavity is empty or partial, and the loft has between 101 and 270mm + remaining_eco4_df["prospect_type"] = np.where( + ( + remaining_eco4_df["eligibility_cavity_type"].isin(["empty", "partial"]) & + remaining_eco4_df["eligibility_loft_type"].isin(["100-270mm"]) + ), + "ECO4 prospect - empty cavity, loft insulation below regulation", + remaining_eco4_df["prospect_type"] + ) - remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts() - remaining_properties["eco4_message"].value_counts() - z = remaining_properties[ - (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") & - (remaining_properties["eco4_eligible"] == True) - ] + # 5) Looks like GBIS instead + remaining_eco4_df["prospect_type"] = np.where( + (remaining_eco4_df["gbis_eligible"] == True), + "Looks like GBIS", + remaining_eco4_df["prospect_type"] + ) - k = z[z["property_type"] == "Flat"] - k["uprn"] + # 6) This is everything else (i.e. 
both the cavity is full and the loft insulation is above 100mm) + remaining_eco4_df["prospect_type"] = remaining_eco4_df["prospect_type"].fillna( + "Does not look like ECO4 candidate" + ) - ha_analysis_results = { + #################################### + # GBIS + #################################### + + remaining_gbis = remaining_properties[ + remaining_properties["funding_scheme"] == "GBIS" + ].copy() + + # 1) Strict GBIS candidates + remaining_gbis["prospect_type"] = np.where( + ( + (remaining_gbis["gbis_eligible"] == True) & (remaining_gbis["eco4_eligible"] == False) + ), + "strict GBIS", + remaining_gbis["prospect_type"] + ) + + # 2) GBIS candidates that look like strict ECO4 candidates + remaining_gbis["prospect_type"] = np.where( + (remaining_gbis["eco4_eligible"] == True), + "Upgradable to ECO4", + remaining_gbis["prospect_type"] + ) + + # 3) Subject to CIGA check - Filled cavity + remaining_gbis["prospect_type"] = np.where( + ( + remaining_gbis["eligibility_cavity_type"].isin(["full"]) + ), + "Filled cavity - subject to CIGA check", + remaining_gbis["prospect_type"] + ) + + # 4) Everything else + remaining_gbis["prospect_type"] = remaining_gbis["prospect_type"].fillna( + "Does not look like GBIS candidate" + ) + + #################################### + # Surplus properties + #################################### + + # Take properties that were not identified by Warmfront and identify those that look like they would qualify + # under the strictest criteria + surplus_df = analysis_data[ + analysis_data["asset_identification_status"] != "identified potential eco works (CWI)" + ].copy() + + eco4_surplus = surplus_df[ + ( + (surplus_df["eco4_eligible"] == True) & (surplus_df["eco4_message"] == "subject to post retrofit sap") & + ( + surplus_df["eligibility_classification"].isin( + ["high confidence", "highest confidence", "medium confidence"] + ) + ) + ) + ].copy() + + gbis_surplus = surplus_df[ + ( + (surplus_df["gbis_eligible"] == True) & 
(surplus_df["eco4_eligible"] == False) & ( + surplus_df["eligibility_cavity_type"].isin(["empty", "partial"]) + ) + ) + ].copy() + + ha_analysis_results.append({ "n_properties_in_asset_list": n_properties_in_asset_list, + ############ # ECO4 + ############ "properties_sold_eco4": properties_sold_eco4, "n_remaining_properties_eco4": n_remaining_properties_eco4, + **remaining_eco4_df["prospect_type"].value_counts().to_dict(), + ############ # GBIS + ############ "properties_sold_gbis": properties_sold_gbis, - "n_remaining_properties_gbis": n_remaining_properties_gbis - } + "n_remaining_properties_gbis": n_remaining_properties_gbis, + **remaining_gbis["prospect_type"].value_counts().to_dict(), + ############ + # Surplus + ############ + "n_eco4_surplus": eco4_surplus.shape[0], + "n_gbis_surplus": gbis_surplus.shape[0], + }) - pass + ha_analysis_results = pd.DataFrame(ha_analysis_results) + + # TODO: create revenue figures and automate creation of Excel def app(): @@ -1152,3 +1280,43 @@ def app(): outputs = get_epc_data( loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False ) + + # for ha_name, datasets in outputs.items(): + # datasets["results_df"] = datasets["results_df"].drop( + # columns=["eligibility_cavity_type", "eligibility_loft_type"] + # ) + # + # # Re-do + # res = [] + # for _, row in tqdm(datasets["results_df"].iterrows(), total=datasets["results_df"].shape[0]): + # epc = { + # "walls-description": row["walls"], + # "roof-description": row["roof"], + # "floor-description": "", + # "tenure": "", + # "current-energy-efficiency": row["sap"], + # } + # eligibility = Eligibility(epc=epc, cleaned=cleaned) + # eligibility.check_eco4_warmfront() + # res.append( + # { + # "row_id": row["row_id"], + # "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], + # "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] + # } + # ) + # + # # Merge back on + # res = pd.DataFrame(res) + # 
datasets["results_df"] = datasets["results_df"].merge(res, how="left", on="row_id") + # + # # Re-save in s3 + # save_pickle_to_s3( + # data={ + # "results_df": datasets["results_df"], + # "scoring_df": datasets["scoring_df"], + # "nodata": datasets["nodata"] + # }, + # bucket_name="retrofit-datalake-dev", + # s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle" + # )