From 7d63c164045c6855ea6cb13091788a2ed7db2afb Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 Nov 2024 18:05:05 +0000
Subject: [PATCH] implemented linear programming to find maximal bid size

---
 .../stonewater/Wave 3 Preparation.py          | 71 ++++++++++++++++---
 .../requirements/requirements-wave-3-prep.txt |  1 +
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 20f771ec..c397f962 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -3,9 +3,9 @@ import PyPDF2
 import re
 import pandas as pd
 import numpy as np
-from docutils.utils.math.tex2mathml_extern import blahtexml
 from tqdm import tqdm
 from collections import Counter
+from scipy.optimize import linprog
 
 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
 SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
@@ -1843,13 +1843,38 @@ def propsed_wave_3_sample():
                     ]
 
             if surveyed_similar.empty:
-                final_missed_matches.append(
-                    {
-                        "Address ID": a_id,
-                        "Confidence Tier": "4 - no similar property, needs survey to confirm",
-                        "Current EPC Band": "Unknown"
-                    }
-                )
+
+                # No close match: fall back to surveyed properties in the same Postal Region with a matching property type
+                surveyed_similar = survey_results_with_original_features[
+                    (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
+                    (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+                        filter_property_types
+                    ))
+                    ]
+                if surveyed_similar.empty:
+                    final_missed_matches.append(
+                        {
+                            "Address ID": a_id,
+                            "Confidence Tier": "4 - no similar property, needs survey to confirm",
+                            "Current EPC Band": "Unknown"
+                        }
+
+                    )
+                else:
+                    expected_sap = surveyed_similar["Current SAP Rating"].mean()
+                    expected_epc = sap_to_epc(expected_sap)
+                    if expected_epc in ["C", "B", "A"]:
+                        tier = "5 - EPC C or above"
+                    else:
+                        tier = "3 - similar property, relaxed conditions"
+
+                    final_missed_matches.append(
+                        {
+                            "Address ID": a_id,
+                            "Confidence Tier": tier,
+                            "Current EPC Band": expected_epc
+                        }
+                    )
                 continue
             # We take an average
             expected_sap = surveyed_similar["Current SAP Rating"].mean()
@@ -1922,5 +1947,35 @@ def propsed_wave_3_sample():
     geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
     geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
 
+    geographic_summary[["Loss", "Gain"]].head()
+
+    loss = geographic_summary["Loss"].values
+    gain = geographic_summary["Gain"].values
+
+    # Objective coefficients: linprog minimises c @ x, so negate Gain in order to maximise total Gain
+    c = -gain
+
+    # Define constraints
+    A = [loss]  # Only 1 constraint for now, total Loss
+    b = [250]  # Maximum total Loss allowed
+
+    # Bounds: continuous 0 <= x <= 1 per row (LP relaxation of select / not select); NOTE: rounding x later can exceed the Loss limit
+    bounds = [(0, 1) for _ in gain]
+
+    # Solve the problem using linprog with HiGHS solver
+    result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs')
+    if not result.success:
+        raise Exception("Optimization failed")
+
+    selected_rows = result.x.round().astype(int)  # Rounded to 0 or 1
+    optimal_gain = -result.fun
+    print(optimal_gain)
+
+    # Flag the rows chosen by the optimiser and total their Gain and Loss
+    geographic_summary["Selected"] = selected_rows == 1
+    geographic_summary[geographic_summary["Selected"]].sum()
+    bid_size = geographic_summary[geographic_summary["Selected"]][["Gain", "Loss"]].sum().sum()
+    print("Bid Size:", bid_size)
+
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
index 3ad5d2c1..09ba20bd 100644
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -7,4 +7,5 @@ epc-api-python==1.0.2
 usaddress==0.5.11
 fuzzywuzzy==0.18.0
 python-dotenv
+scipy