"""Scrape Zoopla sale estimates for a list of UPRNs and merge them into an asset list.

Each property page is fetched (or read from a local HTML cache), the blurred
low/mid/high sale estimates are pulled out of the markup, and selected embedded
JSON fragments are flattened into one record per URL.
"""

import json
import os
import random
import re
import time
from multiprocessing import Pool

import pandas as pd
from bs4 import BeautifulSoup
from stealth_requests import StealthSession
from tqdm import tqdm

ENGINES = ["safari", "chrome"]
CACHE_DIR = "zoopla_cache"
MAX_ATTEMPTS = 5
os.makedirs(CACHE_DIR, exist_ok=True)

# Matches the span from the "attributes" object through the end of the
# "historicSales" array so it can be wrapped in braces and parsed as one object.
_EMBEDDED_BLOB_RE = re.compile(
    r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]', re.DOTALL
)
# A comma directly before a closing brace/bracket (invalid in strict JSON).
_TRAILING_COMMA_RE = re.compile(r",(\s*[}\]])")
_EMBEDDED_KEYS = (
    "attributes",
    "energy",
    "rentEstimate",
    "saleEstimate",
    "saleHistory",
    "historicSales",
)


def random_delay():
    """Pause randomly between requests (0.5–2 s)."""
    time.sleep(random.uniform(0.5, 2))


def extract_feature(soup, icon_id):
    """Return the text next to the icon referenced by ``icon_id``.

    Looks for a ``<use href="#icon_id">`` tag and returns the stripped text of
    its enclosing ``div`` with class ``_1pbf8i53``; returns ``None`` when
    either the tag or that ancestor is missing.
    """
    tag = soup.find("use", href=f"#{icon_id}")
    if tag is None:
        return None
    parent = tag.find_parent("div", class_="_1pbf8i53")
    if parent is None:
        return None
    return parent.get_text(strip=True)


def extract_embedded_json(text):
    """Extract embedded property JSON from raw page text.

    First tries to parse one contiguous snippet running from ``"attributes"``
    to the end of ``"historicSales"``. If that snippet is absent or is not
    valid JSON, falls back to parsing each known key independently.

    Returns:
        A dict of whatever could be parsed (possibly empty).
    """
    match = _EMBEDDED_BLOB_RE.search(text)
    if match:
        snippet = "{" + match.group(0) + "}"
        # Un-escape literal \u0022 sequences and drop trailing commas so the
        # snippet has a chance of being strict JSON.
        snippet = snippet.replace("\\u0022", '"')
        snippet = _TRAILING_COMMA_RE.sub(r"\1", snippet)
        try:
            return json.loads(snippet)
        except json.JSONDecodeError:
            pass  # fall through to the per-key extraction below

    result = {}
    for key in _EMBEDDED_KEYS:
        key_match = re.search(
            rf'"{key}"\s*:\s*(\{{.*?\}}|\[.*?\])', text, re.DOTALL
        )
        if not key_match:
            continue
        try:
            result[key] = json.loads(key_match.group(1))
        except ValueError:
            # Best effort: the non-greedy match may cut a nested value short,
            # in which case this key is simply left out.
            continue
    return result


def scrape_all_estimates(session, url):
    """Scrape valuation estimates for one Zoopla property URL.

    Returns:
        A dict with the parsed ``sale-estimate`` divs (``estimates``), an
        ``is_blocked`` flag that is True when no such div was found, the raw
        page text (``response_html``), and the ``attributes``, ``rent`` and
        ``historicSales`` fragments (each ``None`` when not found).
    """
    resp = session.get(url, impersonate=random.choice(ENGINES))
    html = resp.text
    page_source = BeautifulSoup(html, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    data = extract_embedded_json(html)
    return {
        "estimates": estimates,
        "is_blocked": len(estimates) == 0,
        "response_html": html,
        "attributes": data.get("attributes"),
        "rent": data.get("rentEstimate"),
        "historicSales": data.get("historicSales"),
    }


def _tag_text(parent, tag_name, test_id):
    """Return the text of the first matching child tag, or None if absent."""
    tag = parent.find(tag_name, {"data-testid": test_id})
    return tag.text if tag is not None else None


def extract_estimates(estimates):
    """Extract low, mid, and high estimates from parsed HTML.

    Args:
        estimates: Sequence of parsed ``sale-estimate`` elements; only the
            first one is used.

    Returns:
        A ``(low, mid, high)`` tuple of raw text values. An entry is ``None``
        when its element is missing, and all three are ``None`` when
        ``estimates`` is empty.
    """
    if not estimates:
        return None, None, None
    est = estimates[0]
    low = _tag_text(est, "span", "low-estimate-blurred")
    mid = _tag_text(est, "p", "estimate-blurred")
    high = _tag_text(est, "span", "high-estimate-blurred")
    return low, mid, high


def cache_path_for_url(url):
    """Return a deterministic local cache path for a URL.

    The last non-empty path segment of the URL is used as the file stem, so
    the result is the same with or without a trailing slash.
    """
    uprn = url.rstrip("/").rsplit("/", 1)[-1]
    return os.path.join(CACHE_DIR, f"{uprn}.html")


def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _first_sale(history):
    """Return the first entry of a sales list if it is a dict, otherwise {}."""
    if isinstance(history, list) and history and isinstance(history[0], dict):
        return history[0]
    return {}


def _build_record(url, estimates, attributes, rent, historic_sales):
    """Flatten the estimates and embedded fragments into one output row."""
    low, mid, high = extract_estimates(estimates)
    return {
        "URL": url,
        "Low Estimate": low,
        "Middle Estimate": mid,
        "High Estimate": high,
        # Fragments may be None or a non-dict; coerce them so ** never fails.
        **_as_dict(attributes),
        **_as_dict(rent),
        **_first_sale(historic_sales),
    }


def _record_from_cache(url, cache_path):
    """Build a record from a cached page, or return None if it is unusable."""
    if not os.path.exists(cache_path):
        return None
    with open(cache_path, "r", encoding="utf-8", errors="replace") as fh:
        html = fh.read()
    page_source = BeautifulSoup(html, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    if not estimates:
        return None
    data = extract_embedded_json(html)
    return _build_record(
        url,
        estimates,
        data.get("attributes"),
        data.get("rentEstimate"),
        data.get("historicSales"),
    )


def parallel_task(url):
    """Main worker function executed in each process.

    Uses the cached page when it exists and contains a sale estimate;
    otherwise scrapes live, retrying up to ``MAX_ATTEMPTS`` times and caching
    the first page that contains an estimate.

    Returns:
        A dict with ``URL`` and the ``Low``/``Middle``/``High Estimate`` keys,
        plus any keys from the attributes, rent estimate and first historic
        sale. When every attempt fails, only the four base keys are returned
        with ``None`` estimates.
    """
    cache_path = cache_path_for_url(url)

    cached = _record_from_cache(url, cache_path)
    if cached is not None:
        return cached

    # Otherwise scrape live
    with StealthSession() as session:
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                output = scrape_all_estimates(session, url)
            except Exception as exc:
                # Worker boundary: one failed request must not abort the
                # whole pool, so report it and treat it as a failed attempt.
                print(f"[Attempt {attempt}] Request failed for {url}: {exc}")
                output = None

            if output and not output["is_blocked"] and output["estimates"]:
                # The page text lives under "response_html".
                with open(cache_path, "w", encoding="utf-8") as fh:
                    fh.write(output["response_html"])
                return _build_record(
                    url,
                    output["estimates"],
                    output.get("attributes"),
                    output.get("rent"),
                    output.get("historicSales"),
                )

            print(f"[Attempt {attempt}] Blocked or empty for {url}")
            if attempt < MAX_ATTEMPTS:
                random_delay()

    # If still blocked, return placeholders
    return {
        "URL": url,
        "Low Estimate": None,
        "Middle Estimate": None,
        "High Estimate": None,
    }


def parse_price(p):
    """Convert a price string such as ``"£250k"`` or ``"£1,250,000"`` to a float.

    A trailing ``k`` multiplies by 1,000 and a trailing ``m`` by 1,000,000
    (case-insensitive). Real numbers are passed through as floats.

    Returns:
        The price as a float, or ``None`` for ``None``, NaN, empty,
        non-string/non-numeric, or unparseable input.
    """
    if p is None or isinstance(p, bool):
        return None
    if isinstance(p, (int, float)):
        value = float(p)
        return None if value != value else value  # NaN is not equal to itself
    if not isinstance(p, str):
        return None

    text = p.replace("£", "").replace(",", "").strip().lower()
    if not text:
        return None

    multiplier = 1
    if text.endswith("k"):
        multiplier, text = 1_000, text[:-1]
    elif text.endswith("m"):
        multiplier, text = 1_000_000, text[:-1]

    try:
        return float(text) * multiplier
    except ValueError:
        return None


def main():
    """Load the portfolio, scrape a valuation per UPRN, and save the results."""
    # Load portfolio
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List"
    )

    # .copy() so the column assignment below does not write into a slice.
    asset_list = asset_list[~pd.isnull(asset_list["epc_os_uprn"])].copy()
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    uprns = asset_list["epc_os_uprn"].tolist()
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]
    if not urls:
        print("No UPRNs found; nothing to scrape.")
        return

    # Limit concurrency to avoid blocks
    with Pool(processes=2) as pool:  # fewer processes = fewer fingerprints
        estimates_list = list(
            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    df = pd.DataFrame(estimates_list)
    print(df.head())

    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/", expand=False)
    df["valuation"] = df["Middle Estimate"].apply(parse_price)

    df.to_csv("zoopla_estimates.csv", index=False)

    # Merge with asset list. Deduplicate on UPRN first so a UPRN that appears
    # several times in the asset list does not multiply rows in the left join.
    valuations = df[["uprn", "valuation"]].drop_duplicates(subset="uprn")
    merged = asset_list.merge(
        valuations,
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left"
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False
    )

    print("Done. Results saved.")


if __name__ == "__main__":
    main()