improving valuations scraper

2026-06-08 11:17:27 +00:00 · 2025-10-29 20:26:45 +00:00 · 2025-10-29 20:26:45 +00:00 · 9c5d68f55f
commit 9c5d68f55f
parent 27de54adef
5 changed files with 92 additions and 74 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/asset_list/app.py
+++ b/asset_list/app.py
@ -298,13 +298,13 @@ def app():
    landlord_block_reference = None

    # Project from Nick
-    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/Sep2025 Project"
-    data_filename = "AL Test.xlsx"
-    sheet_name = "Sheet1"
-    postcode_column = 'postcode'
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio"
+    data_filename = "22.10 AL Portfolio.xlsx"
+    sheet_name = "22.10 AL Portfolio"
+    postcode_column = 'Postcode'
    address1_column = None
    address1_method = 'house_number_extraction'
-    fulladdress_column = "address"
+    fulladdress_column = "Address"
    address_cols_to_concat = []
    missing_postcodes_method = None
    landlord_year_built = None
@ -315,7 +315,7 @@ def app():
    landlord_roof_construction = None
    landlord_heating_system = None
    landlord_existing_pv = None
-    landlord_property_id = "row_id"
+    landlord_property_id = "Row ID"
    landlord_sap = None
    outcomes_filename = None
    outcomes_sheetname = None
--- a/etl/webscrape/Zoopla.py
+++ b/etl/webscrape/Zoopla.py
@ -3,109 +3,126 @@ import pandas as pd
 import time
 from stealth_requests import StealthSession
 import random
+import os
 from multiprocessing import Pool
 from tqdm import tqdm

 ENGINES = ["safari", "chrome"]
+CACHE_DIR = "zoopla_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+
+def random_delay():
+    """Sleep for a uniformly random 0.5–2 s; used between retry attempts to pace requests."""
+    time.sleep(random.uniform(0.5, 2))


 def scrape_all_estimates(session, url):
-    # Rotate impersonation per request
-    resp = session.get(url, impersonate=ENGINES[random.randint(0, 1)])
+    """Fetch url and return (sale-estimate divs, is_blocked, raw HTML); is_blocked is True when no div is found."""
+    resp = session.get(url, impersonate=random.choice(ENGINES))
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    is_blocked = len(estimates) == 0
-    return estimates, is_blocked
+    return estimates, is_blocked, resp.text
+
+
+def extract_estimates(estimates):
+    """Return (low, mid, high) estimate text from the first estimate div; a missing tag yields None."""
+    est = estimates[0]
+    wanted = (("span", "low-estimate-blurred"), ("p", "estimate-blurred"), ("span", "high-estimate-blurred"))
+    found = [est.find(tag, {"data-testid": test_id}) for tag, test_id in wanted]
+    low, mid, high = (node.text if node is not None else None for node in found)
+    return low, mid, high
+
+
+def cache_path_for_url(url):
+    """Return the cache file path for a URL, named after its last non-empty path segment (the UPRN)."""
+    uprn = url.rstrip("/").rsplit("/", 1)[-1]
+    return os.path.join(CACHE_DIR, f"{uprn}.html")


 def parallel_task(url):
-    # No impersonate argument here
+    """Return the estimate row for one URL: read the cached HTML if usable, else scrape live (up to 5 tries)."""
+    cache_path = cache_path_for_url(url)
+
+    # Use cached file if it exists
+    if os.path.exists(cache_path):
+        with open(cache_path, "r", encoding="utf-8") as cached:
+            page_source = BeautifulSoup(cached.read(), "html.parser")
+        estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
+        if estimates:
+            low, mid, high = extract_estimates(estimates)
+            return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+
+    # Otherwise scrape live
    with StealthSession() as session:
-        estimates, is_blocked = scrape_all_estimates(session, url)
+        for attempt in range(1, 6):
+            estimates, is_blocked, html = scrape_all_estimates(session, url)
+            if not is_blocked and estimates:
+                # Cache only pages that actually contain an estimate
+                with open(cache_path, "w", encoding="utf-8") as cached:
+                    cached.write(html)
+                low, mid, high = extract_estimates(estimates)
+                return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
+            print(f"[Attempt {attempt}] Blocked or empty for {url}")
+            random_delay()

-        while is_blocked:
-            print(f"Blocked by Zoopla for URL: {url}")
-            time.sleep(random.uniform(0, 1))
-            estimates, is_blocked = scrape_all_estimates(session, url)
-
-        low_estimate = estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
-        middle_estimate = estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
-        high_estimate = estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
-
-    return {
-        "URL": url,
-        "Low Estimate": low_estimate,
-        "Middle Estimate": middle_estimate,
-        "High Estimate": high_estimate,
-    }
+        # If still blocked, return placeholders
+        return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None}


 def parse_price(p):
+    if p is None:
+        return None
+
    p = p.replace("£", "").strip().lower()
+    if not p:
+        return None
    if p.endswith("k"):
-        return float(p[:-1]) * 1000
+        return float(p[:-1]) * 1_000
    elif p.endswith("m"):
        return float(p[:-1]) * 1_000_000
    else:
-        return float(p)
-
-
-# def parallel_task(url):
-#     with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
-#         estimates, is_blocked = scrape_all_estimates(session, url)
-#
-#         while is_blocked:
-#             # Will need to wait and retry if blocked by Zoopla
-#             print(f"Blocked by Zoopla for URL: {url}")
-#             sleep_factor = random.uniform(0, 1)  # Random delay to avoid detection
-#             time.sleep(sleep_factor * 1)
-#             estimates, is_blocked = scrape_all_estimates(session, url)
-#
-#         low_estimate = (
-#             estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
-#         )  # Find all span elements with data-testid="low-estimate"
-#         middle_estimate = (
-#             estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
-#         )  # Find all span elements with data-testid="middle-estimate"
-#         high_estimate = (
-#             estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
-#         )  # Find all span elements with data-testid="high-estimate-blurred"
-#
-#     return {
-#         "URL": url,
-#         "Low Estimate": low_estimate,
-#         "Middle Estimate": middle_estimate,
-#         "High Estimate": high_estimate,
-#     }
+        try:
+            return float(p.replace(",", ""))
+        except ValueError:
+            return None


 if __name__ == "__main__":
-    # Get a SAL
+    # Load portfolio
    asset_list = pd.read_excel(
-        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
-        "Standardised.xlsx",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
+        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List"
    )
+    asset_list = asset_list[~pd.isnull(asset_list["epc_os_uprn"])]
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
    uprns = asset_list["epc_os_uprn"].tolist()
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

-    with Pool(processes=5) as pool:
+    # Limit concurrency to avoid blocks
+    with Pool(processes=2) as pool:  # fewer processes = fewer fingerprints
        estimates_list = list(
-            tqdm(
-                pool.imap(parallel_task, urls),
-                total=len(urls),
-            )
+            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    df = pd.DataFrame(estimates_list)
-    # Extract UPRN from URL
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
+
    df.to_csv("zoopla_estimates.csv", index=False)

-    df["uprn"] = df["uprn"].astype(int).astype(str)
-
-    asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
-        "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
+    # Merge with asset list
+    merged = asset_list.merge(
+        df[["uprn", "valuation"]],
+        left_on="epc_os_uprn",
+        right_on="uprn",
+        how="left"
    )
+    merged.to_excel(
+        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
+        index=False
+    )
+
+    print("Done. Results saved.")
--- a/recommendations/optimiser/funding_optimiser.py
+++ b/recommendations/optimiser/funding_optimiser.py
@ -427,8 +427,9 @@ def optimise_with_funding_paths(p, input_measures, housing_type, funding: Fundin
    solutions["meets_upgrade_target"] = solutions["total_gain"] >= target_gain - 0.1

    # If we have packages that are fundable, but do not meet the upgrade target, we can run a final optimisation pass
-    if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
-        logger.info("We have some packages that are fundable but do not meet the target gain")
+    # Turned off logging - too noisy
+    # if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
+    #     logger.info("We have some packages that are fundable but do not meet the target gain")

    # We now can calculate the project ABS, which subtracts from the cost, but this is only relevant for ECO4
    solutions["starting_sap"] = p.data["current-energy-efficiency"]