Model/etl/webscrape/Zoopla.py
2025-11-10 20:46:57 +00:00

206 lines
6.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from bs4 import BeautifulSoup
import pandas as pd
import time
from stealth_requests import StealthSession
import random
import os
from multiprocessing import Pool
from tqdm import tqdm
import re
import json
# Browser engines to impersonate; one is picked at random per request and
# passed as `impersonate=` to the session in scrape_all_estimates.
ENGINES = ["safari", "chrome"]
# Directory holding cached pages, one "<uprn>.html" file per property.
CACHE_DIR = "zoopla_cache"
# Runs at import time; exist_ok makes it a no-op when the directory exists.
os.makedirs(CACHE_DIR, exist_ok=True)
def random_delay():
    """Sleep for a random interval of 0.5 to 2 seconds between requests."""
    pause_seconds = random.uniform(0.5, 2)
    time.sleep(pause_seconds)
def extract_feature(soup, icon_id):
    """Return the text of the feature container that holds a given icon.

    Looks up the ``<use>`` element whose ``href`` is ``#<icon_id>``, walks up
    to its enclosing ``<div class="_1pbf8i53">`` and returns that element's
    whitespace-stripped text.

    Returns:
        The container text, or ``None`` when either the icon or the
        enclosing container is not found.
    """
    icon = soup.find("use", href=f"#{icon_id}")
    if not icon:
        return None

    container = icon.find_parent("div", class_="_1pbf8i53")
    if not container:
        return None

    return container.get_text(strip=True)
def extract_embedded_json(text):
    """Extract embedded property JSON from raw page text.

    Two strategies are tried in order:

    1. Fast path: grab the span running from ``"attributes": {...}`` through
       the end of the ``"historicSales"`` array, wrap it in braces, clean it
       up and parse it as one JSON object.
    2. Fallback: look up each known key independently and decode the JSON
       object/array that follows it.

    Args:
        text: Raw page text (HTML including any inline JSON).

    Returns:
        A dict. On the fast path it holds every key found in the matched
        span; on the fallback it holds whichever of ``attributes``,
        ``energy``, ``rentEstimate``, ``saleEstimate``, ``saleHistory`` and
        ``historicSales`` could be decoded. Empty when nothing parses.
    """
    match = re.search(
        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
        text,
        re.DOTALL,
    )
    if match:
        snippet = "{" + match.group(0) + "}"
        # Un-escape literal \u0022 sequences back into double quotes.
        snippet = snippet.replace("\\u0022", '"')
        # Drop trailing commas before a closing brace/bracket.
        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
        try:
            return json.loads(snippet)
        except json.JSONDecodeError:
            pass  # fall through to the per-key strategy

    # Fallback for independent keys. raw_decode parses exactly one complete
    # JSON value starting at the given offset, so nested objects/arrays are
    # handled correctly (a non-greedy regex would cut them at the first
    # closing bracket) and trailing page content is ignored.
    decoder = json.JSONDecoder()
    result = {}
    for key in (
        "attributes", "energy", "rentEstimate",
        "saleEstimate", "saleHistory", "historicSales",
    ):
        # Only accept the key when its value starts an object or an array.
        key_match = re.search(rf'"{key}"\s*:\s*(?=[{{\[])', text)
        if key_match is None:
            continue
        try:
            value, _ = decoder.raw_decode(text, key_match.end())
        except json.JSONDecodeError:
            continue  # malformed value: skip this key, keep the others
        result[key] = value
    return result
def scrape_all_estimates(session, url):
    """Fetch one Zoopla property page and collect its valuation data.

    Args:
        session: Session object used to issue the GET request.
        url: Property page URL.

    Returns:
        A dict with the parsed ``sale-estimate`` elements (``estimates``),
        a flag that is True when none were found (``is_blocked``), the raw
        page text (``response_html``) and the ``attributes``, ``rent`` and
        ``historicSales`` entries of the embedded JSON (``None`` if absent).
    """
    engine = random.choice(ENGINES)
    response = session.get(url, impersonate=engine)
    html = response.text

    estimate_nodes = BeautifulSoup(response.text, "html.parser").find_all(
        "div", {"data-testid": "sale-estimate"}
    )
    embedded = extract_embedded_json(html)

    result = {
        "estimates": estimate_nodes,
        # No estimate elements on the page is treated as being blocked.
        "is_blocked": not estimate_nodes,
        "response_html": html,
    }
    result["attributes"] = embedded.get("attributes")
    result["rent"] = embedded.get("rentEstimate")
    result["historicSales"] = embedded.get("historicSales")
    return result
def extract_estimates(estimates):
    """Read the low, middle and high estimate texts from the first element.

    Args:
        estimates: Non-empty sequence of parsed ``sale-estimate`` elements;
            only the first one is inspected.

    Returns:
        A ``(low, mid, high)`` tuple of the elements' text.
    """
    card = estimates[0]
    # (tag name, data-testid) for low, middle and high, in that order.
    targets = (
        ("span", "low-estimate-blurred"),
        ("p", "estimate-blurred"),
        ("span", "high-estimate-blurred"),
    )
    low, mid, high = (
        card.find(tag, {"data-testid": test_id}).text for tag, test_id in targets
    )
    return low, mid, high
def cache_path_for_url(url):
    """Return a deterministic local cache path for a URL.

    The cache key is the last path segment of the URL (the UPRN for
    ``.../property/uprn/<uprn>/`` URLs). Any query string or fragment is
    ignored, and the URL may or may not end with a slash; previously a URL
    without a trailing slash mapped to the segment *before* the UPRN, so
    all such pages shared one cache file.

    Args:
        url: Property page URL.

    Returns:
        Path of the form ``<CACHE_DIR>/<uprn>.html``.
    """
    path_part = url.split("?", 1)[0].split("#", 1)[0]
    uprn = path_part.rstrip("/").rsplit("/", 1)[-1]
    return os.path.join(CACHE_DIR, f"{uprn}.html")
def _first_sale(historic_sales):
    """Return the first historic-sale record as a dict, or {} if unavailable."""
    if isinstance(historic_sales, list) and historic_sales:
        first = historic_sales[0]
        if isinstance(first, dict):
            return first
    return {}


def _build_record(url, estimates, attributes, rent, historic_sales):
    """Assemble the flat result row for one property from its parsed parts."""
    low, mid, high = extract_estimates(estimates)
    return {
        "URL": url,
        "Low Estimate": low,
        "Middle Estimate": mid,
        "High Estimate": high,
        # Any of these may be None/missing when the embedded JSON lacks them.
        **(attributes if isinstance(attributes, dict) else {}),
        **(rent if isinstance(rent, dict) else {}),
        **_first_sale(historic_sales),
    }


def parallel_task(url):
    """Main worker function executed in each process.

    Uses the cached page for ``url`` when one exists and contains estimates;
    otherwise scrapes the page live, retrying up to 5 times with a random
    delay after each failed attempt, and caches the page on success.

    Args:
        url: Property page URL.

    Returns:
        A dict with ``URL``, ``Low Estimate``, ``Middle Estimate`` and
        ``High Estimate``, merged with the property's attributes, rent
        estimate and first historic sale when available. If every attempt is
        blocked or empty, the three estimate values are ``None``.
    """
    cache_path = cache_path_for_url(url)

    # Use cached file if it exists
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as cached:
            html = cached.read()
        page_source = BeautifulSoup(html, "html.parser")
        estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
        if estimates:
            data = extract_embedded_json(html)
            return _build_record(
                url,
                estimates,
                data.get("attributes"),
                data.get("rentEstimate"),
                data.get("historicSales"),
            )
        # Cached page has no estimates (e.g. an empty file): scrape again.

    # Otherwise scrape live
    with StealthSession() as session:
        for attempt in range(1, 6):
            output = scrape_all_estimates(session, url)
            if not output["is_blocked"] and output["estimates"]:
                # Look the page text up before opening the file so a missing
                # key can never leave a truncated cache file behind.
                html = output["response_html"]
                with open(cache_path, "w", encoding="utf-8") as cached:
                    cached.write(html)
                return _build_record(
                    url,
                    output["estimates"],
                    output.get("attributes"),
                    output.get("rent"),
                    output.get("historicSales"),
                )
            print(f"[Attempt {attempt}] Blocked or empty for {url}")
            random_delay()

    # If still blocked, return placeholders
    return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None}
def parse_price(p):
    """Convert a displayed price such as "£450k" or "£1,250,000" to a float.

    The pound sign, thousands separators and surrounding whitespace are
    removed. A trailing ``k`` multiplies by 1,000 and a trailing ``m`` by
    1,000,000 (case-insensitive).

    Args:
        p: Price text, or ``None``.

    Returns:
        The price as a float, or ``None`` when ``p`` is ``None``, empty, or
        not a parseable number (with or without a suffix).
    """
    if p is None:
        return None

    cleaned = p.replace("£", "").replace(",", "").strip().lower()
    if not cleaned:
        return None

    multiplier = 1
    if cleaned.endswith("k"):
        multiplier, cleaned = 1_000, cleaned[:-1]
    elif cleaned.endswith("m"):
        multiplier, cleaned = 1_000_000, cleaned[:-1]

    # One conversion for every branch so malformed text never raises.
    try:
        return float(cleaned) * multiplier
    except ValueError:
        return None
if __name__ == "__main__":
    # Script entry point: load the portfolio, scrape a valuation per UPRN,
    # write the raw results to CSV and the merged portfolio to Excel.

    # Load portfolio
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List"
    )
    # Keep only rows with a UPRN. .copy() makes the filtered frame independent
    # so the column assignment below is not a chained assignment on a slice.
    asset_list = asset_list[asset_list["epc_os_uprn"].notna()].copy()
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
    uprns = asset_list["epc_os_uprn"].tolist()
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    # Limit concurrency to avoid blocks
    with Pool(processes=2) as pool:  # fewer processes = fewer fingerprints
        estimates_list = list(
            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    df = pd.DataFrame(estimates_list)
    print(df.head())
    # expand=False yields a Series for the single capture group.
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/", expand=False)
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
    df.to_csv("zoopla_estimates.csv", index=False)

    # Merge with asset list. The right side is reduced to one row per UPRN:
    # a UPRN that appears several times in the portfolio is also scraped
    # several times, and duplicate right-hand keys would multiply rows.
    valuations = df[["uprn", "valuation"]].drop_duplicates(subset="uprn")
    merged = asset_list.merge(
        valuations,
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left"
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False
    )
    print("Done. Results saved.")