allow no valuation and work with relative

2026-07-27 23:35:01 +00:00 · 2025-10-27 15:27:32 +00:00 · 2025-10-27 15:27:32 +00:00 · 93723697a1
commit 93723697a1
parent e5272e2e64
8 changed files with 265 additions and 53 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/asset_list/app.py
+++ b/asset_list/app.py
@ -59,9 +59,111 @@ def app():
    Property UPRN
    """

+    # Cambridge
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/"
+    data_filename = "22.10_Cambridge_west addresses.xlsx"
+    sheet_name = "Asset List"
+    postcode_column = 'Postcode'
+    address1_column = None
+    address1_method = "house_number_extraction"
+    fulladdress_column = "Full Address"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = None
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_roof_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "id"
+    landlord_sap = None
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_id_colnames = []
+    master_to_asset_list_filepath = None
+    phase = False
+    ecosurv_landlords = None
+    asset_list_header = 0
+    landlord_block_reference = None
+
+    # Property Box
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box"
+    data_filename = "Property Box Finance Portfolio.xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'Postcode'
+    address1_column = None
+    address1_method = "house_number_extraction"
+    fulladdress_column = "Address 1"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = None
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_roof_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "row_id"
+    landlord_sap = None
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_id_colnames = []
+    master_to_asset_list_filepath = None
+    phase = False
+    ecosurv_landlords = None
+    asset_list_header = 0
+    landlord_block_reference = "block_id"
+
+    # CDS - able-to-pay
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay"
+    data_filename = "CDS_ASSET LIST_(2314).xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'Property Address - Postcode'
+    address1_column = "Property Address - Line 1"
+    address1_method = None
+    fulladdress_column = "Property Address - Line 1"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = None
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_roof_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "row_id"
+    landlord_sap = None
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_id_colnames = []
+    master_to_asset_list_filepath = None
+    phase = False
+    ecosurv_landlords = None
+    asset_list_header = 0
+    landlord_block_reference = None
+
    # Hyde - solar
    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar"
-    data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
+    data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
    sheet_name = "Electric Property Inspections"
    postcode_column = 'Postcode'
    address1_column = None  # Is only patchily populated so we create it
@ -88,14 +190,14 @@ def app():
    master_filepaths = []
    master_id_colnames = []
    master_to_asset_list_filepath = None
-    phase = True
+    phase = False
    ecosurv_landlords = None
    asset_list_header = 0
    landlord_block_reference = None

    # Hyde cavity
    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity"
-    data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx"
+    data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx"
    sheet_name = "Cavity Inspections"
    postcode_column = 'Postcode'
    address1_column = None  # Is only patchily populated so we create it
@ -122,7 +224,7 @@ def app():
    master_filepaths = []
    master_id_colnames = []
    master_to_asset_list_filepath = None
-    phase = True
+    phase = False
    ecosurv_landlords = None
    asset_list_header = 0
    landlord_block_reference = None
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -347,7 +347,8 @@ class SearchEpc:
                # We update the data with the correct uprn
                if self.uprn:
                    for x in api_response["response"]["rows"]:
-                        x["uprn"] = self.uprn
+                        if pd.isnull(x["uprn"]):
+                            x["uprn"] = self.uprn

                data["rows"].extend(api_response["response"]["rows"])

@ -357,6 +358,8 @@ class SearchEpc:
            row for row in data["rows"]
            if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
        ]
+        # Overwrite the data
+        self.data = data

        if data["rows"]:
            api_response["msg"] = self.SUCCESS
--- a/backend/engine/engine.py
+++ b/backend/engine/engine.py
@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data(
        cost = sum([r["total"] for r in default_recommendations])
        sap_point_improvement = sum([r["sap_points"] for r in default_recommendations])

-        lower_bound_valuation_uplift = (
-            property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
-            property_value_increase_ranges[p.id]["current_value"]
-        )
-        upper_bound_valuation_uplift = (
-            property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
-            property_value_increase_ranges[p.id]["current_value"]
-        )
+        if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]):
+            lower_bound_valuation_uplift = (
+                property_value_increase_ranges[p.id]["lower_bound_increased_value"] -
+                property_value_increase_ranges[p.id]["current_value"]
+            )
+            upper_bound_valuation_uplift = (
+                property_value_increase_ranges[p.id]["upper_bound_increased_value"] -
+                property_value_increase_ranges[p.id]["current_value"]
+            )
+        else:
+            lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0

        agg_data.append({
            "pre_retrofit_epc": p.data["current-energy-rating"],
@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest):
                plan_input["built_form"] = plan_input["built_form"].map(built_form_map)

                plan_input = plan_input.to_dict("records")
+
            else:
                raise ValueError("Other formats not yet supported")

@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest):
        # If we have patches or overrides, we should read them in here
        patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body)

+        if body.file_type == "xlsx" and body.file_format == "domna_asset_list":
+            # We check if we have valuation data
+            if not valuation_data and body.valuation_file_path in [None, ""]:
+                # Fall back to the "domna_valuation" column of plan_input, if present
+                if "domna_valuation" in plan_input[0]:
+                    valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input]
+
        cleaning_data = read_dataframe_from_s3_parquet(
            bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
        )
@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest):
            if uprn:
                uprn = int(float(uprn))

+            address1 = config.get("address", None)
+            # Handle domna address list format
+            if pd.isnull(address1) and body.file_format == "domna_asset_list":
+                address1 = config.get("domna_full_address", None)
+
+            address1 = str(int(address1)) if isinstance(address1, float) else str(address1)
+
+            full_address = config["domna_full_address"] if body.file_format == "domna_asset_list" else None
+
            epc_searcher = SearchEpc(
-                address1=str(config["address"]),
+                address1=address1,
                postcode=config["postcode"],
                uprn=uprn,
                auth_token=get_settings().EPC_AUTH_TOKEN,
                os_api_key="",
+                full_address=full_address
            )
            epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None)
            epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None)
@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest):

                    upload_funding(session, p, new_plan_id, recommendations_to_upload)

-                    property_valuation_increases.append(
-                        valuations["average_increased_value"] - valuations["current_value"]
-                    )
+                    if valuations["current_value"] > 0:
+                        property_valuation_increases.append(
+                            valuations["average_increased_value"] - valuations["current_value"]
+                        )

                # Commit the session after each batch
                session.commit()
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@ -219,12 +219,19 @@ class PropertyValuation:
        current_epc = property_instance.data["current-energy-rating"]

        if not current_value:
+            # No current value: estimate against a unit value (current_value=1) so the
+            # bounds returned below are relative uplifts rather than absolute amounts
+            relative_improvement = cls.estimate_valuation_improvement(
+                current_value=1,
+                current_epc=current_epc,
+                target_epc=target_epc,
+                total_cost=1
+            )
            return {
                "current_value": 0,
-                "lower_bound_increased_value": 0,
-                "upper_bound_increased_value": 0,
-                "average_increased_value": 0,
-                "average_increase": 0
+                "lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1,
+                "upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1,
+                "average_increased_value": relative_improvement["average_increased_value"] - 1,
+                "average_increase": relative_improvement["average_increase"]
            }

        return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@ -1,38 +1,111 @@
-# Initial Code
-
-from seleniumbase import SB
+from bs4 import BeautifulSoup
+import pandas as pd
 import time
+from stealth_requests import StealthSession
+import random
+from multiprocessing import Pool
+from tqdm import tqdm

-uprns = [
-    100071297618,
-    100080893397,
-    100060778033,
-    200004793081,
-    100071265143,
-    100071297618,
-    100080893397,
-    100060778033,
-    200004793081,
-    100071265143,
-]
+ENGINES = ["safari", "chrome"]

-estimate_list = []

-for uprn in uprns:
+def scrape_all_estimates(session, url):
+    """Fetch a Zoopla property page and pull out its sale-estimate blocks.
+
+    Args:
+        session: Object exposing ``get(url, impersonate=...)``; the caller in
+            this file passes a ``StealthSession``.
+        url: Page URL to request.
+
+    Returns:
+        A ``(estimates, is_blocked)`` tuple. ``estimates`` holds every
+        ``div`` with ``data-testid="sale-estimate"`` found in the response;
+        ``is_blocked`` is True when none were found.
+    """
+    # Rotate impersonation per request. random.choice follows the length of
+    # ENGINES, so adding or removing an engine needs no change here.
+    resp = session.get(url, impersonate=random.choice(ENGINES))
+    page_source = BeautifulSoup(resp.text, "html.parser")
+    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+    # An empty result is reported as "blocked"; a page that simply has no
+    # estimate is indistinguishable from a blocked request at this point.
+    is_blocked = not estimates
+    return estimates, is_blocked

-    # Probably can change the timings here
-    time.sleep(5)
-    with SB(uc=True) as sb:
-        sb.uc_open_with_reconnect(
-            f"https://www.zoopla.co.uk/property/uprn/{uprn}/",
-            3,
+
+def parallel_task(url):
+    """Scrape the low, middle and high sale estimates for one Zoopla URL.
+
+    Opens its own ``StealthSession`` and keeps re-requesting the page until
+    at least one sale-estimate block comes back, sleeping a random 0-1 s
+    between attempts.
+
+    Args:
+        url: Zoopla property page URL.
+
+    Returns:
+        dict with keys "URL", "Low Estimate", "Middle Estimate" and
+        "High Estimate"; the estimate values are the raw element text.
+    """
+    # No impersonate argument here: scrape_all_estimates picks the engine
+    # for each individual request.
+    with StealthSession() as session:
+        estimates, is_blocked = scrape_all_estimates(session, url)
+
+        # NOTE(review): there is no retry cap. "Blocked" just means no
+        # sale-estimate div was found, so a page that never contains one
+        # would keep this worker looping forever.
+        while is_blocked:
+            print(f"Blocked by Zoopla for URL: {url}")
+            time.sleep(random.uniform(0, 1))
+            estimates, is_blocked = scrape_all_estimates(session, url)
+
+        # Only the first sale-estimate block is read.
+        # NOTE(review): presumably .find() returns None when a child element
+        # is missing, which would raise AttributeError on .text - confirm.
+        low_estimate = estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
+        middle_estimate = estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
+        high_estimate = estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
+
+    return {
+        "URL": url,
+        "Low Estimate": low_estimate,
+        "Middle Estimate": middle_estimate,
+        "High Estimate": high_estimate,
+    }
+
+
+def parse_price(p):
+    """Convert a price string such as "£450k" or "£1.5m" to a float.
+
+    The pound sign, thousands separators (commas) and surrounding whitespace
+    are dropped; a trailing "k" or "m" (any case) multiplies the number by
+    1,000 or 1,000,000 respectively.
+
+    Args:
+        p: Price text, e.g. "£450k", "£1.5m" or "£1,250,000".
+
+    Returns:
+        The price as a float.
+
+    Raises:
+        ValueError: If the remaining text is not a valid number.
+    """
+    # Commas are removed as well as the currency sign so that fully written
+    # out figures ("£1,250,000") do not make float() fail.
+    p = p.replace("£", "").replace(",", "").strip().lower()
+    if p.endswith("k"):
+        return float(p[:-1]) * 1000
+    if p.endswith("m"):
+        return float(p[:-1]) * 1_000_000
+    return float(p)
+
+
+# def parallel_task(url):
+#     with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
+#         estimates, is_blocked = scrape_all_estimates(session, url)
+#
+#         while is_blocked:
+#             # Will need to wait and retry if blocked by Zoopla
+#             print(f"Blocked by Zoopla for URL: {url}")
+#             sleep_factor = random.uniform(0, 1)  # Random delay to avoid detection
+#             time.sleep(sleep_factor * 1)
+#             estimates, is_blocked = scrape_all_estimates(session, url)
+#
+#         low_estimate = (
+#             estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
+#         )  # Find all span elements with data-testid="low-estimate"
+#         middle_estimate = (
+#             estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
+#         )  # Find all span elements with data-testid="middle-estimate"
+#         high_estimate = (
+#             estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
+#         )  # Find all span elements with data-testid="high-estimate-blurred"
+#
+#     return {
+#         "URL": url,
+#         "Low Estimate": low_estimate,
+#         "Middle Estimate": middle_estimate,
+#         "High Estimate": high_estimate,
+#     }
+
+
+if __name__ == "__main__":
+    # Load the Standardised Asset List (SAL)
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
+        "Standardised.xlsx",
+        sheet_name="Standardised Asset List"
+    )
+    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
+    uprns = asset_list["epc_os_uprn"].tolist()
+    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
+
+    with Pool(processes=5) as pool:
+        estimates_list = list(
+            tqdm(
+                pool.imap(parallel_task, urls),
+                total=len(urls),
+            )
        )

-        soup = sb.get_beautiful_soup()
+    df = pd.DataFrame(estimates_list)
+    # Extract UPRN from URL
+    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
+    df["valuation"] = df["Middle Estimate"].apply(parse_price)
+    df.to_csv("zoopla_estimates.csv", index=False)

-        estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
-        # Can change the way we extract the text here
-        estimate_text = (
-            estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"]
-        )
-        estimate_list.append(estimate_text)
+    df["uprn"] = df["uprn"].astype(int).astype(str)
+
+    asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
+        "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
+    )
--- a/etl/webscrape/requirements.txt
+++ b/etl/webscrape/requirements.txt
@ -0,0 +1,5 @@
+beautifulsoup4>=4.12.0
+pandas>=2.0.0
+stealth-requests>=1.0.7
+tqdm>=4.65.0
+openpyxl