# Model/etl/webscrape/Zoopla.py

import json
import logging
import os
import random
import re
import time
from multiprocessing import Pool

import pandas as pd
from bs4 import BeautifulSoup
from stealth_requests import StealthSession
from tqdm import tqdm

# Browser identities passed as `impersonate=` on each request; one is chosen
# at random per request in scrape_all_estimates.
ENGINES = ["safari", "chrome"]
# Directory holding the cached page HTML, one "<uprn>.html" file per property.
CACHE_DIR = "zoopla_cache"
# Runs at import time (module level), so the directory exists before any
# cache file is read or written.
os.makedirs(CACHE_DIR, exist_ok=True)


def random_delay(min_seconds=0.5, max_seconds=2):
    """Sleep for a random duration to space out consecutive requests.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    The defaults reproduce the previous hard-coded 0.5-2 second window, so
    existing zero-argument calls behave exactly as before.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))


def extract_embedded_json(text):
    """Pull the property data embedded as JSON inside a page's raw text.

    Two strategies are tried in order:

    1. Grab the span running from ``"attributes": {...}`` to the first ``]``
       after ``"historicSales"``, wrap it in braces and parse it as a single
       JSON object.
    2. If that span is missing or is not valid JSON, look up each known key
       on its own and decode the object/array that follows it.

    Args:
        text: Raw page text (HTML with embedded JSON).

    Returns:
        A dict of whatever could be decoded. It is empty when nothing was
        found. In the per-key fallback each value is a dict or a list,
        depending on what follows the key in the text.
    """
    if not text:
        return {}

    match = re.search(
        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
        text,
        re.DOTALL
    )
    if match:
        snippet = "{" + match.group(0) + "}"
        # Undo \u0022-escaped quotes and drop trailing commas before a
        # closing bracket, both of which would make json.loads reject it.
        snippet = re.sub(r"\\u0022", '"', snippet)
        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
        try:
            return json.loads(snippet)
        except json.JSONDecodeError:
            # The lazy match stops at the first "]" and is often truncated;
            # fall through to the per-key lookup below.
            pass

    decoder = json.JSONDecoder()
    result = {}
    for key in (
        "attributes", "energy", "rentEstimate",
        "saleEstimate", "saleHistory", "historicSales"
    ):
        # Locate the first occurrence of the key that is followed by an
        # object or an array; the lookahead leaves the bracket unconsumed so
        # decoding can start right on it.
        key_match = re.search(rf'"{re.escape(key)}"\s*:\s*(?=[{{\[])', text)
        if key_match is None:
            continue
        try:
            # raw_decode parses exactly one JSON value and ignores whatever
            # follows it, so nested objects/arrays are handled correctly
            # (a lazy regex would cut them off at the first closing bracket).
            value, _end = decoder.raw_decode(text, key_match.end())
        except (ValueError, RecursionError):
            # Best effort: a key whose value cannot be decoded is skipped.
            continue
        result[key] = value
    return result


def scrape_all_estimates(session, url):
    """Fetch one property page and collect everything parsed from it.

    Args:
        session: Object exposing ``get(url, impersonate=...)`` whose response
            has a ``text`` attribute.
        url: Address of the page to fetch.

    Returns:
        A dict with the ``sale-estimate`` div elements (``estimates``), a
        flag that is True when none were found (``is_blocked``), the raw page
        text (``response_html``) and the embedded ``attributes``,
        ``rentEstimate`` and ``historicSales`` payloads (empty defaults when
        absent).
    """
    engine = random.choice(ENGINES)
    page_source = session.get(url, impersonate=engine).text

    estimate_nodes = BeautifulSoup(page_source, "html.parser").find_all(
        "div", {"data-testid": "sale-estimate"}
    )
    embedded = extract_embedded_json(page_source)

    result = {
        "estimates": estimate_nodes,
        # A page without any estimate block is treated as a blocked response.
        "is_blocked": not estimate_nodes,
        "response_html": page_source,
    }
    for field, fallback in (
        ("attributes", {}),
        ("rentEstimate", {}),
        ("historicSales", []),
    ):
        result[field] = embedded.get(field, fallback)
    return result


def extract_estimates(estimates):
    """Read the low, middle and high estimate texts from the first block.

    Args:
        estimates: Sequence of parsed ``sale-estimate`` elements; only the
            first one is inspected.

    Returns:
        A ``(low, mid, high)`` tuple of the elements' text. A value is
        ``None`` when its element is missing from the block, and all three
        are ``None`` when ``estimates`` is empty, so partially rendered
        pages no longer raise ``AttributeError`` / ``IndexError``.
    """
    if not estimates:
        return None, None, None

    est = estimates[0]
    selectors = (
        ("span", "low-estimate-blurred"),
        ("p", "estimate-blurred"),
        ("span", "high-estimate-blurred"),
    )
    values = []
    for tag, test_id in selectors:
        node = est.find(tag, {"data-testid": test_id})
        # find() yields None when nothing matches; compare with `is None`
        # rather than truthiness so an empty element still counts as found.
        values.append(None if node is None else node.text)

    low, mid, high = values
    return low, mid, high


def cache_path_for_url(url, cache_dir=None):
    """Return the cache file path for a property URL.

    The file is named after the last path segment of the URL (the UPRN in
    the URLs this script builds), with an ``.html`` extension.

    Args:
        url: Property URL, with or without a trailing slash.
        cache_dir: Directory for the cache file; defaults to ``CACHE_DIR``.

    Returns:
        Path of the form ``<cache_dir>/<last segment>.html``.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR
    # Strip trailing slashes first: indexing a plain split("/") from the end
    # returns the segment *before* the id when the URL has no trailing slash,
    # which would map every such URL onto the same cache file.
    uprn = url.rstrip("/").rsplit("/", 1)[-1]
    return os.path.join(cache_dir, f"{uprn}.html")


def parse_cached_html(url, html):
    """Build a result row from previously saved page HTML.

    Args:
        url: URL the HTML was fetched from; stored in the row as ``"URL"``.
        html: Page text read back from the cache.

    Returns:
        A dict with the URL, the three estimate texts and the fields of the
        embedded ``attributes``, ``rentEstimate`` and first ``historicSales``
        entry, or ``None`` when the HTML has no ``sale-estimate`` block.
    """
    soup = BeautifulSoup(html, "html.parser")
    estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
    if not estimates:
        # Nothing usable in the cache; skip the JSON extraction entirely.
        return None

    data = extract_embedded_json(html)
    low, mid, high = extract_estimates(estimates)

    row = {
        "URL": url,
        "Low Estimate": low,
        "Middle Estimate": mid,
        "High Estimate": high,
    }

    # extract_embedded_json can return either a list or a dict for any key,
    # so merge a payload only when it has the expected shape instead of
    # letting a mismatch raise while unpacking.
    for key in ("attributes", "rentEstimate"):
        payload = data.get(key)
        if isinstance(payload, dict):
            row.update(payload)

    history = data.get("historicSales")
    if isinstance(history, list) and history and isinstance(history[0], dict):
        # Only the first sale record is flattened into the row.
        row.update(history[0])

    return row


def _build_row(url, estimates, attributes, rent_estimate, historic_sales):
    """Assemble one output row, skipping embedded payloads of an unexpected shape."""
    low, mid, high = extract_estimates(estimates)
    row = {
        "URL": url,
        "Low Estimate": low,
        "Middle Estimate": mid,
        "High Estimate": high,
    }
    # extract_embedded_json can return either a list or a dict for any key,
    # so only merge payloads that really are mappings.
    for payload in (attributes, rent_estimate):
        if isinstance(payload, dict):
            row.update(payload)
    if isinstance(historic_sales, list) and historic_sales:
        first_sale = historic_sales[0]
        if isinstance(first_sale, dict):
            # Only the first sale record is flattened into the row.
            row.update(first_sale)
    return row


def parallel_task(url):
    """Return the result row for one property URL (pool worker entry point).

    The cached HTML is used when it exists and still contains an estimate
    block. Otherwise the page is fetched up to five times, with a random
    pause after each failed attempt, and the first usable response is
    written to the cache.

    Args:
        url: Property URL to process.

    Returns:
        A dict with ``"URL"``, the three estimate texts and any embedded
        property fields. When every attempt fails, the three estimates are
        ``None`` and no other fields are present.
    """
    log = logging.getLogger(__name__)
    max_attempts = 5
    cache_path = cache_path_for_url(url)

    if os.path.exists(cache_path):
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                html = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # An unreadable cache entry is not fatal; fetch the page again.
            log.warning("Ignoring unreadable cache file %s: %s", cache_path, exc)
        else:
            cached = parse_cached_html(url, html)
            if cached:
                return cached

    with StealthSession() as session:
        for attempt in range(1, max_attempts + 1):
            try:
                output = scrape_all_estimates(session, url)
            except Exception as exc:
                # The HTTP client's exception types are not visible from this
                # module, so catch broadly at this per-URL boundary. An
                # uncaught error in a worker is re-raised in the parent when
                # the result is consumed, which would abort the whole run.
                log.warning(
                    "Attempt %d/%d failed for %s: %s",
                    attempt, max_attempts, url, exc,
                )
                output = None

            if output and not output["is_blocked"] and output["estimates"]:
                html = output.get("response_html")
                if html:
                    try:
                        with open(cache_path, "w", encoding="utf-8") as f:
                            f.write(html)
                    except OSError as exc:
                        # Failing to cache must not discard a good scrape.
                        log.warning(
                            "Could not write cache file %s: %s", cache_path, exc
                        )

                return _build_row(
                    url,
                    output["estimates"],
                    output.get("attributes"),
                    output.get("rentEstimate"),
                    output.get("historicSales"),
                )

            random_delay()

    return {
        "URL": url,
        "Low Estimate": None,
        "Middle Estimate": None,
        "High Estimate": None,
    }


def parse_price(p):
    """Convert a displayed price such as ``"£250k"`` into a float.

    Handles an optional ``£`` sign, thousands separators and a trailing
    ``k`` (thousands) or ``m`` (millions) suffix, case-insensitively.

    Args:
        p: Price text, or ``None``.

    Returns:
        The price as a float, or ``None`` when ``p`` is empty, is not a
        string (e.g. a NaN from a DataFrame), or cannot be parsed.
    """
    if not p or not isinstance(p, str):
        return None

    # Remove separators up front so suffixed values like "£1,250k" parse too.
    cleaned = p.replace("£", "").replace(",", "").strip().lower()

    multiplier = 1
    if cleaned.endswith("k"):
        multiplier = 1_000
        cleaned = cleaned[:-1]
    elif cleaned.endswith("m"):
        multiplier = 1_000_000
        cleaned = cleaned[:-1]

    try:
        return float(cleaned) * multiplier
    except ValueError:
        # Covers non-numeric text as well as a bare "k" / "m".
        return None


if __name__ == "__main__":
    # Load the asset list and keep one row per property that has a UPRN.
    source_workbook = (
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
        "Project/modelling_sample.xlsx"
    )
    asset_list = pd.read_excel(
        source_workbook,
        sheet_name="Standardised Asset List",
    )

    has_uprn = asset_list["epc_os_uprn"].notna()
    asset_list = asset_list[has_uprn]
    asset_list = asset_list.drop_duplicates("epc_os_uprn")
    # Cast through int so the UPRN becomes plain digits without a decimal part.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    urls = [
        f"https://www.zoopla.co.uk/property/uprn/{uprn}/"
        for uprn in asset_list["epc_os_uprn"].tolist()
    ]

    # Scrape with two worker processes; imap yields rows in input order.
    with Pool(processes=2) as pool:
        progress = tqdm(pool.imap(parallel_task, urls), total=len(urls))
        estimates_list = list(progress)

    # Raw scrape output, plus the UPRN recovered from each URL and the middle
    # estimate converted to a number.
    df = pd.DataFrame(estimates_list)
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
    df.to_csv("zoopla_estimates.csv", index=False)

    # Attach the valuation to every asset; assets without a result keep NaN.
    valuations = df[["uprn", "valuation"]]
    merged = asset_list.merge(
        valuations,
        how="left",
        left_on="epc_os_uprn",
        right_on="uprn",
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False,
    )

    print("Done. Results saved.")