# Model/etl/webscrape/Zoopla.py

from bs4 import BeautifulSoup
import pandas as pd
import time
from stealth_requests import StealthSession
import random
import os
from multiprocessing import Pool
from tqdm import tqdm
import re
import json

ENGINES = ["safari", "chrome"]
CACHE_DIR = "zoopla_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def random_delay(min_seconds=0.5, max_seconds=2.0):
    """Pause randomly between requests (0.5–2 s by default).

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    The duration is drawn uniformly between the two bounds. Calling the
    function without arguments keeps the original 0.5–2 s window.
    """
    delay = random.uniform(min_seconds, max_seconds)
    # time.sleep() rejects negative values, so clamp defensively.
    time.sleep(max(0.0, delay))


def extract_feature(soup, icon_id):
    """Return the text of the feature container associated with an icon.

    Finds the first ``<use>`` element whose ``href`` equals ``#<icon_id>``,
    then asks it for a parent ``<div>`` carrying the class ``_1pbf8i53`` and
    returns that div's text with surrounding whitespace stripped.

    Args:
        soup: Parsed document (or any node) exposing ``find``.
        icon_id: Icon identifier, without the leading ``#``.

    Returns:
        The container text, or ``None`` when the icon or its container is
        not found.
    """
    icon = soup.find("use", href=f"#{icon_id}")
    if not icon:
        return None

    container = icon.find_parent("div", class_="_1pbf8i53")
    if not container:
        return None

    return container.get_text(strip=True)


def extract_embedded_json(text):
    """
    Extract embedded property JSON containing attributes, energy, estimates, and sales history.

    Two strategies are tried in order:

    1. Take the span running from the ``"attributes"`` object up to the first
       ``]`` after ``"historicSales"``, wrap it in braces and parse it as one
       JSON document.
    2. If that fails, look up each known key on its own and decode the object
       or array that follows it.

    Args:
        text: Raw page source to search.

    Returns:
        dict: Decoded values keyed by the JSON key they were found under.
        Empty when nothing could be decoded.
    """
    combined = re.search(
        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
        text,
        re.DOTALL,
    )
    if combined:
        snippet = "{" + combined.group(0) + "}"
        # Turn literal \u0022 escape sequences into plain double quotes.
        snippet = snippet.replace("\\u0022", '"')
        # Drop trailing commas that sit directly before a closing } or ].
        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
        try:
            return json.loads(snippet)
        except json.JSONDecodeError:
            pass  # fall through to the per-key strategy

    # Fallback for independent keys. raw_decode() parses exactly one JSON value
    # starting at a given offset, so nested objects/arrays and braces inside
    # strings are handled correctly (a non-greedy regex would cut the value at
    # the first closing brace and the key would be lost).
    decoder = json.JSONDecoder()
    result = {}
    for key in (
        "attributes", "energy", "rentEstimate",
        "saleEstimate", "saleHistory", "historicSales",
    ):
        # Only consider occurrences whose value starts an object or an array.
        pattern = '"' + re.escape(key) + r'"\s*:\s*(?=[{\[])'
        for hit in re.finditer(pattern, text):
            try:
                value, _ = decoder.raw_decode(text, hit.end())
            except json.JSONDecodeError:
                continue  # malformed here; try the next occurrence
            result[key] = value
            break
    return result


def scrape_all_estimates(session, url):
    """Fetch one Zoopla property page and gather its estimate markup and embedded data.

    Args:
        session: Session object whose ``get`` accepts an ``impersonate``
            keyword; one of ``ENGINES`` is picked at random per request.
        url: Property page URL to request.

    Returns:
        dict with the keys ``estimates`` (the ``sale-estimate`` divs found in
        the page), ``is_blocked`` (``True`` when no such div was found),
        ``response_html`` (the raw response text) and ``attributes`` /
        ``rent`` / ``historicSales`` (values from the embedded JSON, ``None``
        when absent).
    """
    response = session.get(url, impersonate=random.choice(ENGINES))
    body = response.text

    estimate_nodes = BeautifulSoup(body, "html.parser").find_all(
        "div", {"data-testid": "sale-estimate"}
    )
    embedded = extract_embedded_json(body)

    result = {"estimates": estimate_nodes}
    # A page without any sale-estimate block is treated as a blocked response.
    result["is_blocked"] = len(estimate_nodes) == 0
    result["response_html"] = body
    result["attributes"] = embedded.get("attributes")
    result["rent"] = embedded.get("rentEstimate")
    result["historicSales"] = embedded.get("historicSales")
    return result


def extract_estimates(estimates):
    """Read the low, middle and high figures from the first estimate block.

    Args:
        estimates: Non-empty sequence of parsed ``sale-estimate`` nodes; only
            the first one is inspected.

    Returns:
        tuple: ``(low, mid, high)`` text values, taken from the elements
        tagged ``low-estimate-blurred``, ``estimate-blurred`` and
        ``high-estimate-blurred`` respectively.
    """
    block = estimates[0]
    # (tag name, data-testid) for each figure, in return order.
    targets = (
        ("span", "low-estimate-blurred"),
        ("p", "estimate-blurred"),
        ("span", "high-estimate-blurred"),
    )
    figures = [
        block.find(tag_name, {"data-testid": test_id}).text
        for tag_name, test_id in targets
    ]
    return tuple(figures)


def cache_path_for_url(url):
    """Return a deterministic local cache path for a URL.

    The last non-empty path segment of the URL names the cache file inside
    ``CACHE_DIR`` (for ``.../property/uprn/<uprn>/`` links that segment is the
    UPRN). Trailing slashes are ignored, so a URL maps to the same file with
    or without one instead of every slash-less URL collapsing onto one name.

    Args:
        url: Property page URL.

    Returns:
        str: Path of the ``.html`` cache file for that URL.
    """
    uprn = url.rstrip("/").rsplit("/", 1)[-1]
    return os.path.join(CACHE_DIR, f"{uprn}.html")


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict (safe to ``**``-unpack)."""
    return value if isinstance(value, dict) else {}


def _build_record(url, estimates, attributes, rent, historic_sales):
    """Assemble one flat result row from estimate markup and embedded JSON values."""
    low, mid, high = extract_estimates(estimates)
    # Only the first historic sale is merged into the row; anything missing,
    # empty or of an unexpected type contributes no columns.
    first_sale = None
    if isinstance(historic_sales, list) and historic_sales:
        first_sale = historic_sales[0]
    return {
        "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high,
        **_as_dict(attributes),
        **_as_dict(rent),
        **_as_dict(first_sale),
    }


def parallel_task(url):
    """Main worker function executed in each process.

    Uses the locally cached page for ``url`` when it exists and contains an
    estimate block; otherwise scrapes the page live (up to 5 attempts with a
    random pause between them) and caches the HTML of the first successful
    response.

    Args:
        url: Property page URL.

    Returns:
        dict: ``URL`` plus ``Low Estimate`` / ``Middle Estimate`` /
        ``High Estimate`` and any embedded attribute, rent-estimate and
        first-historic-sale fields. When every attempt fails, only the four
        base keys are present and the estimates are ``None``.
    """
    cache_path = cache_path_for_url(url)

    # Use cached file if it exists
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8", errors="replace") as cached:
            html = cached.read()
        page_source = BeautifulSoup(html, "html.parser")
        estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
        if estimates:
            data = extract_embedded_json(html)
            return _build_record(
                url,
                estimates,
                data.get("attributes"),
                data.get("rentEstimate"),
                data.get("historicSales"),
            )
        # A cache file without estimates is unusable: fall through and re-scrape.

    # Otherwise scrape live
    with StealthSession() as session:
        for attempt in range(1, 6):
            output = scrape_all_estimates(session, url)
            if not output["is_blocked"] and output["estimates"]:
                # The scraper exposes the page source under "response_html".
                with open(cache_path, "w", encoding="utf-8") as cache_file:
                    cache_file.write(output["response_html"])
                # The scraper always sets these keys, possibly to None, so the
                # values are normalised in _build_record rather than via
                # dict.get defaults (which would not apply).
                return _build_record(
                    url,
                    output["estimates"],
                    output.get("attributes"),
                    output.get("rent"),
                    output.get("historicSales"),
                )
            print(f"[Attempt {attempt}] Blocked or empty for {url}")
            random_delay()

    # If still blocked, return placeholders
    return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None}


def parse_price(p):
    """Convert a displayed price such as ``"£450k"`` or ``"£1,250,000"`` to a float.

    The pound sign, thousands separators and surrounding whitespace are
    removed; a trailing ``k`` or ``m`` (any case) multiplies the number by one
    thousand or one million.

    Args:
        p: Price text, or ``None``.

    Returns:
        float | None: The numeric value, or ``None`` when ``p`` is ``None``,
        empty, or not a parseable number.
    """
    if p is None:
        return None

    cleaned = p.replace("£", "").replace(",", "").strip().lower()
    if not cleaned:
        return None

    multiplier = {"k": 1_000, "m": 1_000_000}.get(cleaned[-1], 1)
    if multiplier != 1:
        cleaned = cleaned[:-1]

    # One guarded conversion for every branch, so text that merely ends in
    # "k"/"m" (or has nothing before the suffix) yields None instead of raising.
    try:
        return float(cleaned) * multiplier
    except ValueError:
        return None


def main():
    """Scrape Zoopla valuations for every asset with a UPRN and save the results.

    Reads the standardised asset list, builds one Zoopla URL per distinct
    UPRN, scrapes them through a small process pool, writes the raw results to
    ``zoopla_estimates.csv`` and writes the asset list with a ``valuation``
    column merged in to an Excel workbook.
    """
    # Load portfolio
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List"
    )
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the original.
    asset_list = asset_list[asset_list["epc_os_uprn"].notna()].copy()
    # Explicit 64-bit width: UPRNs can exceed the 32-bit integer range.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype("int64").astype(str)

    # Scrape each UPRN once (order preserved). Duplicates would cost extra
    # requests and would multiply asset rows in the left merge below.
    uprns = list(dict.fromkeys(asset_list["epc_os_uprn"]))
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    # Limit concurrency to avoid blocks
    with Pool(processes=2) as pool:  # fewer processes = fewer fingerprints
        estimates_list = list(
            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    df = pd.DataFrame(estimates_list)

    print(df.head())

    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    df["valuation"] = df["Middle Estimate"].apply(parse_price)

    df.to_csv("zoopla_estimates.csv", index=False)

    # Merge with asset list
    merged = asset_list.merge(
        df[["uprn", "valuation"]],
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left"
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False
    )

    print("Done. Results saved.")


if __name__ == "__main__":
    main()