From 1b38832e27abcbebe575f4be867a41e4ae772949 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 20:13:19 +0000 Subject: [PATCH] 2044 properties added --- .../stonewater/Wave 3 Preparation.py | 148 ++++++++++++++---- 1 file changed, 117 insertions(+), 31 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 460aa8ee..6f98c9fd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1938,6 +1938,27 @@ def propsed_wave_3_sample(): ) ].copy() + if surveyed.empty: + if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]: + filter_property_types = ["House", "Bungalow", ] + else: + filter_property_types = ["Flat"] + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + ) + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + if "Electric" in property["Heating"]: # Take other electric heating systems surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] @@ -1950,6 +1971,9 @@ def propsed_wave_3_sample(): elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": # Take other properties with room heaters surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + elif "Boiler" in property["Heating"]: + # Take other properties with boilers + surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")] else: raise Exception("Fix me") @@ -1972,17 +1996,29 @@ def propsed_wave_3_sample(): # Check if we have a postcode match check if surveyed postcode is the same as the property postcode if any(surveyed["Postcode"] == property["Postcode"]): - surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]] + surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]] if any(surveyed["Postal Region"] == property["Postal Region"]): - surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] # Take the 5 nearest - surveyed_similar = surveyed_similar.head(5) + surveyed = surveyed.head(5) + + # # We allow a max distance of 10km + # surveyed = surveyed[surveyed["distance_meters"] < 10000] + # if surveyed.empty: + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": "4 - no similar property, needs survey to confirm", + # "Current EPC Band": "Needs Survey" + # } + # ) + # continue # perform a weighted mean of SAP rating - the closer the better expected_sap = np.average( - surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1) + surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) ) expected_epc = sap_to_epc(expected_sap) @@ -2153,23 +2189,21 @@ def propsed_wave_3_sample(): # '1 - Archetype surveyed', # '1 - property was surveyed', # '2 - same archetype', - # '3 - similar property', - # '3 - similar property, all areas searched', - # '3 - similar property, relaxed conditions' + # '3 - similar property, weighted on distance' + + gain_columns = [ + '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', + '3 - similar property, weighted on distance' + ] # # Loss is the sum of these columns: # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' - geographic_summary["Gain"] = geographic_summary[ - [ - '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property', - '3 - similar property, all areas searched', '3 - similar property, relaxed conditions' - ] - ].sum(axis=1) - geographic_summary["Loss"] = geographic_summary[ - ['5 - EPC C or above', '5 - property was surveyed'] - ].sum(axis=1) + loss_columns = ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', + '5 - property was surveyed'] + geographic_summary["Gain"] = geographic_summary[gain_columns].sum(axis=1) + geographic_summary["Loss"] = geographic_summary[loss_columns].sum(axis=1) print(geographic_summary.sum()) @@ -2180,30 +2214,82 @@ def propsed_wave_3_sample(): loss = geographic_summary["Loss"].values gain = geographic_summary["Gain"].values - # Define the coefficients for the objective function (negative because we maximize Gain) - c = -gain + def optimise(gain, loss, max_loss=250): - # Define constraints - A = [loss] # Only 1 constraint for now, total Loss - b = [250] # Maximum total Loss allowed + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain - # Bounds for each variable (select or not select each row, 0 <= x <= 1) - bounds = [(0, 1) for _ in gain] + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [max_loss] # Maximum total Loss allowed - # Solve the problem using linprog with HiGHS solver - result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') - if not result.success: - raise Exception("Optimization failed") + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] - selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 - optimal_gain = -result.fun - print(optimal_gain) + # Solve the problem using linprog with HiGHS solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + optimal_gain = -result.fun + + return selected_rows, optimal_gain + + selected_rows, _ = optimise(gain, loss, 250) # Select the rows that are selected geographic_summary["Selected"] = selected_rows == 1 geographic_summary[geographic_summary["Selected"]].sum() - bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum() + + region_totals = geographic_summary[ + geographic_summary["Selected"] + ][["Gain", "Loss"]].sum() + + # We now see if there are any postcodes that have no loss that can be added + unselected_regions = geographic_summary[~geographic_summary["Selected"]]["Postal Region"].values + + postcode_summary = results.pivot_table( + index='Postcode', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + postcode_summary = postcode_summary.merge( + results[["Postcode", "Postal Region"]].drop_duplicates(), + how="left", on="Postcode" + ) + + postcode_summary_unselected_regions = postcode_summary[ + postcode_summary["Postcode"].str.split(" ").str[0].isin(unselected_regions) + ].copy() + + postcode_summary_unselected_regions["Gain"] = postcode_summary_unselected_regions[gain_columns].sum(axis=1) + postcode_summary_unselected_regions["Loss"] = postcode_summary_unselected_regions[loss_columns].sum(axis=1) + + # Remaining loss allowed + remaining_loss_constraint = 250 - region_totals["Loss"] + postcode_selected_rows, _ = optimise( + gain=postcode_summary_unselected_regions["Gain"].values, + loss=postcode_summary_unselected_regions["Loss"].values, + max_loss=int(remaining_loss_constraint) + ) + + postcode_summary_unselected_regions["Selected"] = postcode_selected_rows == 1 + postcode_summary_unselected_regions[postcode_summary_unselected_regions["Selected"]][["Gain", "Loss"]].sum() + + postcode_optimised_additional_properties = postcode_summary_unselected_regions[ + postcode_summary_unselected_regions["Selected"] + ] + + postcode_totals = postcode_optimised_additional_properties[["Gain", "Loss"]].sum() + + bid_size = region_totals.sum() + postcode_totals.sum() print("Bid Size:", bid_size) + total_epc_d_or_below = region_totals["Gain"] + postcode_totals["Gain"] + print("Total EPC D or below:", total_epc_d_or_below) + total_epc_c = region_totals["Loss"] + postcode_totals["Loss"] + print("Total EPC C or above:", total_epc_c) # if __name__ == "__main__": # main()