mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
206 lines
6.4 KiB
Python
206 lines
6.4 KiB
Python
from bs4 import BeautifulSoup
|
||
import pandas as pd
|
||
import time
|
||
from stealth_requests import StealthSession
|
||
import random
|
||
import os
|
||
from multiprocessing import Pool
|
||
from tqdm import tqdm
|
||
import re
|
||
import json
|
||
|
||
# Browser engines a request may impersonate; one is chosen at random per fetch.
ENGINES = [
    "safari",
    "chrome",
]

# Directory that holds the cached HTML of scraped property pages.
CACHE_DIR = "zoopla_cache"

# Ensure the cache directory exists as soon as the module is loaded.
os.makedirs(CACHE_DIR, exist_ok=True)
|
||
|
||
|
||
def random_delay(min_seconds=0.5, max_seconds=2):
    """Sleep for a random duration to space out consecutive requests.

    Args:
        min_seconds: One bound of the pause in seconds (default 0.5).
        max_seconds: The other bound of the pause in seconds (default 2).

    Returns:
        None. The call simply blocks for the drawn duration.

    Raises:
        ValueError: If the drawn duration is negative (raised by
            ``time.sleep``).
    """
    # The duration is drawn uniformly between the two bounds.
    time.sleep(random.uniform(min_seconds, max_seconds))
|
||
|
||
|
||
def extract_feature(soup, icon_id):
    """Return the text of the feature container that holds a given icon.

    Looks for a ``<use>`` tag whose ``href`` is ``#<icon_id>``, then walks up
    to the enclosing ``div`` with class ``_1pbf8i53`` and returns that div's
    whitespace-stripped text.

    Args:
        soup: Parsed document (or tag) to search.
        icon_id: Icon identifier, without the leading ``#``.

    Returns:
        The container's text, or ``None`` when either the icon reference or
        its enclosing container cannot be found.
    """
    icon_ref = soup.find("use", href=f"#{icon_id}")
    if not icon_ref:
        return None

    container = icon_ref.find_parent("div", class_="_1pbf8i53")
    if not container:
        return None

    return container.get_text(strip=True)
|
||
|
||
|
||
def extract_embedded_json(text):
    """Extract the property JSON embedded in a page's source text.

    Two strategies are tried in order:

    1. Capture the contiguous run starting at ``"attributes"`` and ending at
       the first ``]`` after ``"historicSales"``, wrap it in braces and parse
       it as a single JSON object.
    2. If that fails, look up each known key on its own and decode the JSON
       object or array that follows it.

    Args:
        text: Page source to search.

    Returns:
        dict: On the first strategy, every key found in the captured run. On
        the fallback, any of ``attributes``, ``energy``, ``rentEstimate``,
        ``saleEstimate``, ``saleHistory`` and ``historicSales`` that could be
        decoded. Empty when nothing is found or ``text`` is empty.
    """
    if not text:
        return {}

    run = re.search(
        r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]',
        text,
        re.DOTALL,
    )
    if run:
        snippet = "{" + run.group(0) + "}"
        # Turn literal \u0022 escape sequences back into plain quotes.
        snippet = snippet.replace("\\u0022", '"')
        # Drop trailing commas that would make the snippet invalid JSON.
        snippet = re.sub(r",(\s*[}\]])", r"\1", snippet)
        try:
            return json.loads(snippet)
        except json.JSONDecodeError:
            pass  # fall through to the per-key lookup below

    # Fallback: decode each key independently. raw_decode consumes exactly one
    # JSON value, so nested objects/arrays are captured whole instead of being
    # cut at the first closing bracket.
    decoder = json.JSONDecoder()
    result = {}
    for key in (
        "attributes", "energy", "rentEstimate",
        "saleEstimate", "saleHistory", "historicSales",
    ):
        opener = re.compile(rf'"{re.escape(key)}"\s*:\s*(?=[{{\[])')
        for hit in opener.finditer(text):
            try:
                value, _ = decoder.raw_decode(text[hit.end():])
            except json.JSONDecodeError:
                continue  # malformed here; try the next occurrence of the key
            result[key] = value
            break
    return result
|
||
|
||
|
||
def scrape_all_estimates(session, url):
    """Fetch one Zoopla property page and gather its estimate data.

    Args:
        session: Session object used to issue the GET request.
        url: Property page URL to fetch.

    Returns:
        dict with the keys:
            ``estimates``: every ``div`` tagged ``data-testid="sale-estimate"``.
            ``is_blocked``: ``True`` when no such div was found.
            ``response_html``: the raw response text.
            ``attributes`` / ``rent`` / ``historicSales``: values taken from
            the embedded JSON (``None`` when absent).
    """
    # Impersonate a randomly chosen engine for this request.
    engine = random.choice(ENGINES)
    response = session.get(url, impersonate=engine)
    html = response.text

    estimate_nodes = BeautifulSoup(html, "html.parser").find_all(
        "div", {"data-testid": "sale-estimate"}
    )
    embedded = extract_embedded_json(html)

    result = {
        "estimates": estimate_nodes,
        # A page without any estimate block is treated as blocked.
        "is_blocked": not estimate_nodes,
        "response_html": html,
    }
    for out_key, json_key in (
        ("attributes", "attributes"),
        ("rent", "rentEstimate"),
        ("historicSales", "historicSales"),
    ):
        result[out_key] = embedded.get(json_key)
    return result
|
||
|
||
|
||
def extract_estimates(estimates):
    """Extract the low, middle and high estimate texts from parsed HTML.

    Only the first element of ``estimates`` is inspected.

    Args:
        estimates: Sequence of parsed ``sale-estimate`` elements.

    Returns:
        tuple: ``(low, mid, high)`` texts. Any part whose element is missing
        is ``None``; all three are ``None`` when ``estimates`` is empty.
    """
    if not estimates:
        return None, None, None

    block = estimates[0]

    def _text_of(tag_name, test_id):
        # find() yields None when the element is absent; map that to None
        # instead of failing on ``.text``.
        node = block.find(tag_name, {"data-testid": test_id})
        return node.text if node is not None else None

    return (
        _text_of("span", "low-estimate-blurred"),
        _text_of("p", "estimate-blurred"),
        _text_of("span", "high-estimate-blurred"),
    )
|
||
|
||
|
||
def cache_path_for_url(url, cache_dir=None):
    """Return a deterministic local cache path for a property URL.

    The file name is the last non-empty path segment of the URL (the UPRN for
    ``.../property/uprn/<uprn>/`` URLs), so the result is the same with or
    without a trailing slash, query string or fragment.

    Args:
        url: Property page URL.
        cache_dir: Directory for the cache file; defaults to ``CACHE_DIR``.

    Returns:
        str: ``<cache_dir>/<last-segment>.html``.

    Raises:
        ValueError: If the URL contains no non-empty path segment.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR

    # Ignore any fragment or query string before looking at path segments.
    path = url.split("#", 1)[0].split("?", 1)[0]
    segments = [segment for segment in path.split("/") if segment]
    if not segments:
        raise ValueError(f"Cannot derive a cache name from URL: {url!r}")

    return os.path.join(cache_dir, f"{segments[-1]}.html")
|
||
|
||
|
||
def _first_sale(historic_sales):
    """Return the first historic-sale entry as a dict, or ``{}`` if unavailable."""
    if (
        isinstance(historic_sales, list)
        and historic_sales
        and isinstance(historic_sales[0], dict)
    ):
        return historic_sales[0]
    return {}


def _build_record(url, estimates, attributes, rent, historic_sales):
    """Assemble one result row from estimate markup and embedded JSON values."""
    low, mid, high = extract_estimates(estimates)
    return {
        "URL": url,
        "Low Estimate": low,
        "Middle Estimate": mid,
        "High Estimate": high,
        # Any of these may be None/missing; spread only real dicts.
        **(attributes if isinstance(attributes, dict) else {}),
        **(rent if isinstance(rent, dict) else {}),
        **_first_sale(historic_sales),
    }


def parallel_task(url):
    """Produce the estimate record for one property URL (pool worker).

    A cached copy of the page is used when it exists and contains an estimate
    block; otherwise the page is scraped live with up to five attempts, and a
    successful response is written to the cache.

    Args:
        url: Property page URL.

    Returns:
        dict: ``URL``, ``Low Estimate``, ``Middle Estimate`` and
        ``High Estimate``, plus whatever keys the page's attributes, rent
        estimate and first historic sale provide. When every attempt is
        blocked or empty, only the four base keys are returned, with the
        estimates set to ``None``.
    """
    cache_path = cache_path_for_url(url)

    # Use cached file if it exists
    if os.path.exists(cache_path):
        # errors="replace" keeps a cache file written under another encoding
        # from crashing the worker.
        with open(cache_path, "r", encoding="utf-8", errors="replace") as fh:
            html = fh.read()
        estimates = BeautifulSoup(html, "html.parser").find_all(
            "div", {"data-testid": "sale-estimate"}
        )
        if estimates:
            data = extract_embedded_json(html)
            return _build_record(
                url,
                estimates,
                data.get("attributes"),
                data.get("rentEstimate"),
                data.get("historicSales"),
            )
        # A cached page without estimates falls through to a live scrape.

    # Otherwise scrape live
    max_attempts = 5
    with StealthSession() as session:
        for attempt in range(1, max_attempts + 1):
            output = scrape_all_estimates(session, url)
            if not output["is_blocked"] and output["estimates"]:
                # The scraper exposes the page source under "response_html".
                with open(cache_path, "w", encoding="utf-8") as fh:
                    fh.write(output["response_html"])
                return _build_record(
                    url,
                    output["estimates"],
                    output.get("attributes"),
                    output.get("rent"),
                    output.get("historicSales"),
                )
            print(f"[Attempt {attempt}] Blocked or empty for {url}")
            random_delay()

    # If still blocked, return placeholders
    return {
        "URL": url,
        "Low Estimate": None,
        "Middle Estimate": None,
        "High Estimate": None,
    }
|
||
|
||
|
||
def parse_price(p):
    """Convert a displayed price such as ``"£450k"`` into a number.

    Accepts an optional ``£`` sign, thousands separators, and a trailing
    ``k`` (thousands) or ``m`` (millions) suffix, case-insensitively.

    Args:
        p: Price text, a number, or ``None``.

    Returns:
        float | None: The price as a float. ``None`` when ``p`` is ``None``,
        NaN, blank, not a string/number, or cannot be parsed.
    """
    if isinstance(p, (int, float)) and not isinstance(p, bool):
        # NaN is the only float that differs from itself.
        return None if p != p else float(p)
    if not isinstance(p, str):
        return None

    cleaned = p.replace("£", "").replace(",", "").strip().lower()
    if not cleaned:
        return None

    multipliers = {"k": 1_000, "m": 1_000_000}
    multiplier = multipliers.get(cleaned[-1])
    if multiplier is None:
        number, multiplier = cleaned, 1
    else:
        number = cleaned[:-1]

    try:
        return float(number) * multiplier
    except ValueError:
        # Text that merely ends in "k"/"m", or is otherwise non-numeric.
        return None
|
||
|
||
|
||
if __name__ == "__main__":
    # Load portfolio
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List"
    )

    # Keep only rows with a UPRN. The explicit copy makes the column
    # assignment below act on an independent frame rather than on a slice.
    asset_list = asset_list[asset_list["epc_os_uprn"].notna()].copy()
    # int64 avoids platform-dependent integer width for long UPRNs.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype("int64").astype(str)

    # Scrape each UPRN once (order preserved): repeated UPRNs would otherwise
    # be fetched twice and multiply rows in the merge below.
    uprns = list(dict.fromkeys(asset_list["epc_os_uprn"]))
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    # Limit concurrency to avoid blocks
    with Pool(processes=2) as pool:  # fewer processes = fewer fingerprints
        estimates_list = list(
            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    df = pd.DataFrame(estimates_list)

    print(df.head())

    # Recover the UPRN from each URL and convert the middle estimate to a number.
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/", expand=False)
    df["valuation"] = df["Middle Estimate"].apply(parse_price)

    df.to_csv("zoopla_estimates.csv", index=False)

    # Merge with asset list
    merged = asset_list.merge(
        df[["uprn", "valuation"]],
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left"
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False
    )

    print("Done. Results saved.")
|