From 93723697a18aeed93ef9d784fae9fff477cf62e8 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 27 Oct 2025 15:27:32 +0000
Subject: [PATCH] Allow missing valuations and fall back to relative uplift

---
 .idea/Model.iml                |   2 +-
 .idea/misc.xml                 |   2 +-
 asset_list/app.py              | 110 ++++++++++++++++++++++++++-
 backend/SearchEpc.py           |   5 +-
 backend/engine/engine.py       |  46 +++++++++---
 backend/ml_models/Valuation.py |  15 +++-
 etl/webscrape/Zoopla.py        | 133 +++++++++++++++++++++++++--------
 etl/webscrape/requirements.txt |   5 ++
 8 files changed, 265 insertions(+), 53 deletions(-)
 create mode 100644 etl/webscrape/requirements.txt
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 09f2e496..c6561970 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index fb10c6b0..50cad4ca 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/asset_list/app.py b/asset_list/app.py
index 2903e083..20cf04f1 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -59,9 +59,111 @@ def app():
     Property UPRN
     """
 
+    # Cambridge - west addresses
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/"
+    data_filename = "22.10_Cambridge_west addresses.xlsx"
+    sheet_name = "Asset List"
+    postcode_column = 'Postcode'
+    address1_column = None
+    address1_method = "house_number_extraction"
+    fulladdress_column = "Full Address"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = None
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_roof_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "id"
+    landlord_sap = None
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_id_colnames = []
+    master_to_asset_list_filepath = None
+    phase = False
+    ecosurv_landlords = None
+    asset_list_header = 0
+    landlord_block_reference = None
+
+    # Property Box
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box"
+    data_filename = "Property Box Finance Portfolio.xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'Postcode'
+    address1_column = None
+    address1_method = "house_number_extraction"
+    fulladdress_column = "Address 1"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = None
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_roof_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "row_id"
+    landlord_sap = None
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_id_colnames = []
+    master_to_asset_list_filepath = None
+    phase = False
+    ecosurv_landlords = None
+    asset_list_header = 0
+    landlord_block_reference = "block_id"
+
+    # CDS - able-to-pay
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay"
+    data_filename = "CDS_ASSET LIST_(2314).xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'Property Address - Postcode'
+    address1_column = "Property Address - Line 1"
+    address1_method = None
+    fulladdress_column = "Property Address - Line 1"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = None
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_roof_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "row_id"
+    landlord_sap = None
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_id_colnames = []
+    master_to_asset_list_filepath = None
+    phase = False
+    ecosurv_landlords = None
+    asset_list_header = 0
+    landlord_block_reference = None
+
     # Hyde - solar
     data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar"
-    data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
+    data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
     sheet_name = "Electric Property Inspections"
     postcode_column = 'Postcode'
     address1_column = None  # Is only patchily populated so we create it
@@ -88,14 +190,14 @@ def app():
     master_filepaths = []
     master_id_colnames = []
     master_to_asset_list_filepath = None
-    phase = True
+    phase = False
     ecosurv_landlords = None
     asset_list_header = 0
     landlord_block_reference = None
 
     # Hyde cavity
     data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity"
-    data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
+    data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
     sheet_name = "Cavity Inspections"
     postcode_column = 'Postcode'
     address1_column = None  # Is only patchily populated so we create it
@@ -122,7 +224,7 @@ def app():
     master_filepaths = []
     master_id_colnames = []
     master_to_asset_list_filepath = None
-    phase = True
+    phase = False
     ecosurv_landlords = None
     asset_list_header = 0
     landlord_block_reference = None
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 16dd8f04..1a14e87a 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -347,7 +347,8 @@ class SearchEpc:
                 # We update the data with the correct uprn
                 if self.uprn:
                     for x in api_response["response"]["rows"]:
-                        x["uprn"] = self.uprn
+                        if pd.isnull(x["uprn"]):
+                            x["uprn"] = self.uprn
 
                 data["rows"].extend(api_response["response"]["rows"])
 
@@ -357,6 +358,8 @@ class SearchEpc:
             row for row in data["rows"]
             if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
         ]
+        # Overwrite the data
+        self.data = data
 
         if data["rows"]:
             api_response["msg"] = self.SUCCESS
diff --git a/backend/engine/engine.py b/backend/engine/engine.py
index fa1f191c..f2674290 100644
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data(
         cost = sum([r["total"] for r in default_recommendations])
         sap_point_improvement = sum([r["sap_points"] for r in default_recommendations])
 
-        lower_bound_valuation_uplift = (
-            property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
-            property_value_increase_ranges[p.id]["current_value"]
-        )
-        upper_bound_valuation_uplift = (
-            property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
-            property_value_increase_ranges[p.id]["current_value"]
-        )
+        if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]):
+            lower_bound_valuation_uplift = (
+                property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
+                property_value_increase_ranges[p.id]["current_value"]
+            )
+            upper_bound_valuation_uplift = (
+                property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
+                property_value_increase_ranges[p.id]["current_value"]
+            )
+        else:
+            lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0
 
         agg_data.append({
             "pre_retrofit_epc": p.data["current-energy-rating"],
@@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest):
                 plan_input["built_form"] = plan_input["built_form"].map(built_form_map)
 
                 plan_input = plan_input.to_dict("records")
+
             else:
                 raise ValueError("Other formats not yet supported")
 
@@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest):
         # If we have patches or overrides, we should read them in here
         patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body)
 
+        if body.file_type == "xlsx" and body.file_format == "domna_asset_list":
+            # We check if we have valuation data
+            if not valuation_data and body.valuation_file_path in [None, ""]:
+                # We check plan_input
+                if "domna_valuation" in plan_input[0]:
+                    valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input]
+
         cleaning_data = read_dataframe_from_s3_parquet(
             bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
         )
@@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest):
             if uprn:
                 uprn = int(float(uprn))
 
+            address1 = config.get("address", None)
+            # Handle domna address list format
+            if pd.isnull(address1) and body.file_format == "domna_asset_list":
+                address1 = config.get("domna_full_address", None)
+
+            address1 = str(int(address1)) if isinstance(address1, float) else str(address1)
+
+            full_address = config["domna_full_address"] if body.file_format == "domna_asset_list" else None
+
             epc_searcher = SearchEpc(
-                address1=str(config["address"]),
+                address1=address1,
                 postcode=config["postcode"],
                 uprn=uprn,
                 auth_token=get_settings().EPC_AUTH_TOKEN,
                 os_api_key="",
+                full_address=full_address
             )
             epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None)
             epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None)
@@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest):
 
                     upload_funding(session, p, new_plan_id, recommendations_to_upload)
 
-                    property_valuation_increases.append(
-                        valuations["average_increased_value"] - valuations["current_value"]
-                    )
+                    if valuations["current_value"] > 0:
+                        property_valuation_increases.append(
+                            valuations["average_increased_value"] - valuations["current_value"]
+                        )
 
                 # Commit the session after each batch
                 session.commit()
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index 8c57900f..17db0dae 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -219,12 +219,19 @@ class PropertyValuation:
         current_epc = property_instance.data["current-energy-rating"]
 
         if not current_value:
+            # In this case, we return a % improvement rather than an absolute
+            relative_improvement = cls.estimate_valuation_improvement(
+                current_value=1,
+                current_epc=current_epc,
+                target_epc=target_epc,
+                total_cost=1
+            )
             return {
                 "current_value": 0,
-                "lower_bound_increased_value": 0,
-                "upper_bound_increased_value": 0,
-                "average_increased_value": 0,
-                "average_increase": 0
+                "lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1,
+                "upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1,
+                "average_increased_value": relative_improvement["average_increased_value"] - 1,
+                "average_increase": relative_improvement["average_increase"]
             }
 
         return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py
index bb86c759..7b3fd5b6 100644
--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@@ -1,38 +1,111 @@
-# Initial Code
-
-from seleniumbase import SB
+from bs4 import BeautifulSoup
+import pandas as pd
 import time
+from stealth_requests import StealthSession
+import random
+from multiprocessing import Pool
+from tqdm import tqdm
 
-uprns = [
-    100071297618,
-    100080893397,
-    100060778033,
-    200004793081,
-    100071265143,
-    100071297618,
-    100080893397,
-    100060778033,
-    200004793081,
-    100071265143,
-]
+ENGINES = ["safari", "chrome"]
 
-estimate_list = []
 
-for uprn in uprns:
+def scrape_all_estimates(session, url):
+    """Fetch *url* and return (sale-estimate divs, is_blocked); a page with no such div counts as blocked."""
+    # Rotate the impersonated browser per request; random.choice keeps working if ENGINES grows
+    resp = session.get(url, impersonate=random.choice(ENGINES))
+    page_source = BeautifulSoup(resp.text, "html.parser")
+    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+    return estimates, not estimates
 
-    # Probably can change the timings here
-    time.sleep(5)
-    with SB(uc=True) as sb:
-        sb.uc_open_with_reconnect(
-            f"https://www.zoopla.co.uk/property/uprn/{uprn}/",
-            3,
+
+def parallel_task(url, max_retries=5):
+    """Scrape Zoopla's low/middle/high estimates for *url*, retrying up to *max_retries* times when blocked."""
+    result = {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None}
+    # Impersonation is rotated per request in scrape_all_estimates, so none is set on the session
+    with StealthSession() as session:
+        estimates, is_blocked = scrape_all_estimates(session, url)
+        for _ in range(max_retries):  # bounded: a page with no estimate always looks "blocked"
+            if not is_blocked:
+                break
+            print(f"Blocked by Zoopla for URL: {url}")
+            time.sleep(random.uniform(0, 1))
+            estimates, is_blocked = scrape_all_estimates(session, url)
+    if is_blocked:
+        return result  # give up on this URL: every estimate stays None
+    fields = {"Low Estimate": ("span", "low-estimate-blurred"), "Middle Estimate": ("p", "estimate-blurred"),
+              "High Estimate": ("span", "high-estimate-blurred")}
+    for key, (tag, test_id) in fields.items():
+        node = estimates[0].find(tag, {"data-testid": test_id})
+        result[key] = node.text if node is not None else None
+    return result
+
+
+def parse_price(p):
+    """Convert a price such as "£250k", "£1.2m" or "£250,000" to a float (NaN when the price is missing)."""
+    if pd.isnull(p):
+        return float("nan")
+    p = p.replace("£", "").replace(",", "").strip().lower()
+    multiplier = {"k": 1_000, "m": 1_000_000}.get(p[-1:], 1)
+    number = p[:-1] if p[-1:] in ("k", "m") else p
+    return float(number) * multiplier
+
+
+# def parallel_task(url):
+#     with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
+#         estimates, is_blocked = scrape_all_estimates(session, url)
+#
+#         while is_blocked:
+#             # Will need to wait and retry if blocked by Zoopla
+#             print(f"Blocked by Zoopla for URL: {url}")
+#             sleep_factor = random.uniform(0, 1)  # Random delay to avoid detection
+#             time.sleep(sleep_factor * 1)
+#             estimates, is_blocked = scrape_all_estimates(session, url)
+#
+#         low_estimate = (
+#             estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
+#         )  # Find all span elements with data-testid="low-estimate"
+#         middle_estimate = (
+#             estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
+#         )  # Find all span elements with data-testid="middle-estimate"
+#         high_estimate = (
+#             estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
+#         )  # Find all span elements with data-testid="high-estimate-blurred"
+#
+#     return {
+#         "URL": url,
+#         "Low Estimate": low_estimate,
+#         "Middle Estimate": middle_estimate,
+#         "High Estimate": high_estimate,
+#     }
+
+
+if __name__ == "__main__":
+    # Load the Standardised Asset List (SAL); its "epc_os_uprn" column supplies the UPRNs to look up
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
+        "Standardised.xlsx",
+        sheet_name="Standardised Asset List"
+    )
+    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)  # NOTE(review): raises on NaN
+    uprns = asset_list["epc_os_uprn"].tolist()
+    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
+
+    with Pool(processes=5) as pool:  # scrape with 5 worker processes
+        estimates_list = list(
+            tqdm(  # progress bar over the scraped results
+                pool.imap(parallel_task, urls),
+                total=len(urls),
+            )
         )
 
-        soup = sb.get_beautiful_soup()
+    df = pd.DataFrame(estimates_list)
+    # Recover each UPRN from its request URL so the estimates can be joined back to the asset list
+    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
+    df["valuation"] = df["Middle Estimate"].apply(parse_price)  # the middle estimate is used as the valuation
+    df.to_csv("zoopla_estimates.csv", index=False)  # raw scrape results, relative to the working directory
 
-        estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
-        # Can change the way we extract the text here
-        estimate_text = (
-            estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"]
-        )
-        estimate_list.append(estimate_text)
+    df["uprn"] = df["uprn"].astype(int).astype(str)
+    # Keep one estimate per UPRN so a UPRN repeated in the asset list does not multiply rows in the left join
+    valuations = df[["uprn", "valuation"]].drop_duplicates(subset="uprn")
+    asset_list.merge(valuations, left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
+        "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False)
diff --git a/etl/webscrape/requirements.txt b/etl/webscrape/requirements.txt
new file mode 100644
index 00000000..4027a224
--- /dev/null
+++ b/etl/webscrape/requirements.txt
@@ -0,0 +1,5 @@
+beautifulsoup4>=4.12.0
+pandas>=2.0.0
+stealth-requests>=1.0.7
+tqdm>=4.65.0
+openpyxl
\ No newline at end of file