From 7d63c164045c6855ea6cb13091788a2ed7db2afb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 18:05:05 +0000 Subject: [PATCH] implemented linear programming to find maximal bid size --- .../stonewater/Wave 3 Preparation.py | 71 ++++++++++++++++--- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 20f771ec..c397f962 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,9 +3,9 @@ import PyPDF2 import re import pandas as pd import numpy as np -from docutils.utils.math.tex2mathml_extern import blahtexml from tqdm import tqdm from collections import Counter +from scipy.optimize import linprog CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") @@ -1843,13 +1843,38 @@ def propsed_wave_3_sample(): ] if surveyed_similar.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Unknown" - } - ) + + # We get an average based on the postcode + surveyed_similar = survey_results_with_original_features[ + (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + )) + ] + if surveyed_similar.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Unknown" + } + + ) + else: + expected_sap = surveyed_similar["Current SAP Rating"].mean() + expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: + tier = "5 - EPC C or above" + else: + tier = "3 - similar property, relaxed conditions" + + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": tier, + "Current EPC Band": expected_epc + } + ) continue # We take an average expected_sap = surveyed_similar["Current SAP Rating"].mean() @@ -1922,5 +1947,35 @@ def propsed_wave_3_sample(): geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() + geographic_summary[["Loss", "Gain"]].head() + + loss = geographic_summary["Loss"].values + gain = geographic_summary["Gain"].values + + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain + + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [250] # Maximum total Loss allowed + + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] + + # Solve the problem using linprog with HiGHS solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + optimal_gain = -result.fun + print(optimal_gain) + + # Select the rows that are selected + geographic_summary["Selected"] = selected_rows == 1 + geographic_summary[geographic_summary["Selected"]].sum() + bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum() + print("Bid Size:", bid_size) + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 3ad5d2c1..09ba20bd 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -7,4 +7,5 @@ epc-api-python==1.0.2 usaddress==0.5.11 fuzzywuzzy==0.18.0 python-dotenv +scipy