diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 40dfd38e..5b1e2f91 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1716,20 +1716,11 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - # Take the duplicated archetypes - duplicated_archetypes = region_surveyed[ - region_surveyed["Archetype ID"].duplicated() - ]["Archetype ID"].unique() - duplicated_archetypes = region_surveyed[ - region_surveyed["Archetype ID"].isin(duplicated_archetypes) - ] - - # We need to select which one is the most relevant to these properties - survey_data = survey_results_with_original_features[ - survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values) - ] - - raise NotImplementedError("Fix me") + region_surveyed = survey_results[ + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() + region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) region_assets = region_assets.merge( region_surveyed, @@ -1744,6 +1735,17 @@ def propsed_wave_3_sample(): pd.isnull(region_assets["Confidence Tier"]), "1 - Archetype surveyed", region_assets["Confidence Tier"] ) + + region_assets["Current EPC Band"] = np.where( + pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]), + region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"] + ) + # Handle EPC C + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + "6 - EPC C or above", region_assets["Confidence Tier"] + ) + region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) # TODO: Turn into a function missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) @@ -1752,36 +1754,16 @@ def propsed_wave_3_sample(): survey_results["Archetype ID"].isin(missed_archetypes) ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + # TODO - We could average the property?? And call it borderline, call out it was averaged!!! + # We could also find the nearest property to it, with similar wall, roof, heating? + # Can use long/lag to distance calc. We have this data from previous + if archetype_surveyed["Archetype ID"].duplicated().sum(): - # We need to select which one is the most relevant to these properties - duplicated_archetypes = archetype_surveyed[ - archetype_surveyed["Archetype ID"].duplicated() - ]["Archetype ID"].unique() - - survey_data = survey_results_with_original_features[ - survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes) - ] - - homes_with_these_archetypes = region_assets[ - region_assets["Archetype ID"].isin(duplicated_archetypes) - ] - - for _, home in homes_with_these_archetypes.iterrows(): - first_filter = survey_data[ - (survey_data["Postal Region"] == home["Postal Region"]) & - (survey_data["Property Type"] == home["Property Type"]) & - (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) - ] - - if not first_filter.empty: - NotImplementedError("Fix me 0") - - second_filter = survey_data[ - (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) & - (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) - ] - - raise NotImplementedError("Fix me 2") + archetype_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() + archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc) + archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"]) region_assets = region_assets.merge( archetype_surveyed,