# Model/etl/webscrape/Zoopla.py
# 2025-10-27 15:27:32 +00:00
#
# 111 lines
# 3.9 KiB
# Python

from bs4 import BeautifulSoup
import pandas as pd
import time
from stealth_requests import StealthSession
import random
from multiprocessing import Pool
from tqdm import tqdm
# Browser engines to impersonate; one is picked at random for every request.
ENGINES = ["safari", "chrome"]


def scrape_all_estimates(session, url):
    """Fetch *url* and return the sale-estimate elements found on the page.

    Args:
        session: An open session whose ``get`` method accepts an
            ``impersonate`` keyword (a ``StealthSession`` in this file).
        url: Address of the Zoopla property page to fetch.

    Returns:
        A ``(estimates, is_blocked)`` tuple. ``estimates`` holds every
        ``<div data-testid="sale-estimate">`` element in the response and
        ``is_blocked`` is True when none were found. A page that simply has
        no estimate is indistinguishable from a blocked request here.
    """
    # Rotate impersonation per request. random.choice covers every entry of
    # ENGINES, so adding an engine to the list needs no change here.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    return estimates, not estimates
def _estimate_text(container, tag, test_id):
    """Return the text of the first *tag* with that data-testid, or None."""
    node = container.find(tag, {"data-testid": test_id})
    return None if node is None else node.text


def parallel_task(url, *, max_retries=8, max_delay=30.0):
    """Scrape the low, middle and high Zoopla sale estimates for one URL.

    Intended to be run in a worker process: it opens its own session, fetches
    the page, and retries while ``scrape_all_estimates`` reports the request
    as blocked. That flag only means "no sale-estimate element was found",
    so a property with no estimate would otherwise be retried forever; the
    retries are therefore bounded.

    Args:
        url: Zoopla property page to scrape.
        max_retries: Maximum number of extra attempts after the first fetch.
        max_delay: Upper limit, in seconds, on the random wait before a retry.

    Returns:
        A dict with the keys ``"URL"``, ``"Low Estimate"``,
        ``"Middle Estimate"`` and ``"High Estimate"``. The estimates are the
        raw element texts, or ``None`` when the page never yielded an
        estimate or the corresponding element is missing.
    """
    # No impersonate argument here: the engine is chosen per request inside
    # scrape_all_estimates.
    with StealthSession() as session:
        estimates, is_blocked = scrape_all_estimates(session, url)
        attempt = 0
        while is_blocked and attempt < max_retries:
            attempt += 1
            print(f"Blocked by Zoopla for URL: {url}")
            # Random wait whose ceiling doubles each attempt (1s, 2s, 4s, ...)
            # up to max_delay, so repeated failures slow down instead of
            # hammering the site at a constant rate.
            time.sleep(random.uniform(0, min(max_delay, 2 ** (attempt - 1))))
            estimates, is_blocked = scrape_all_estimates(session, url)

    if is_blocked:
        print(f"No estimate found after {attempt} retries for URL: {url}")
        low_estimate = middle_estimate = high_estimate = None
    else:
        # Only the first sale-estimate element on the page is used.
        first = estimates[0]
        low_estimate = _estimate_text(first, "span", "low-estimate-blurred")
        middle_estimate = _estimate_text(first, "p", "estimate-blurred")
        high_estimate = _estimate_text(first, "span", "high-estimate-blurred")

    return {
        "URL": url,
        "Low Estimate": low_estimate,
        "Middle Estimate": middle_estimate,
        "High Estimate": high_estimate,
    }
def parse_price(p):
    """Convert a displayed price such as ``"£450k"`` into a float of pounds.

    Args:
        p: Price text. A leading ``£``, thousands separators, surrounding
            whitespace and letter case are ignored. A trailing ``k`` means
            thousands and a trailing ``m`` means millions. ``None``, NaN and
            blank strings are treated as missing.

    Returns:
        The price as a float, or NaN when *p* is missing.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # Missing values are common in scraped columns; report them as NaN rather
    # than failing on a method call that only strings support.
    if p is None or (isinstance(p, float) and p != p):
        return float("nan")

    # float() rejects comma-grouped digits ("1,250,000"), so drop the commas.
    text = str(p).replace("£", "").replace(",", "").strip().lower()
    if not text:
        return float("nan")

    multipliers = {"k": 1_000, "m": 1_000_000}
    factor = multipliers.get(text[-1])
    if factor is None:
        return float(text)
    return float(text[:-1]) * factor
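# Superseded earlier version of parallel_task, kept for reference. It chose
# the impersonated engine once per session instead of once per request.
# def parallel_task(url):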
# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
# estimates, is_blocked = scrape_all_estimates(session, url)
#
# while is_blocked:
# # Will need to wait and retry if blocked by Zoopla
# print(f"Blocked by Zoopla for URL: {url}")
# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection
# time.sleep(sleep_factor * 1)
# estimates, is_blocked = scrape_all_estimates(session, url)
#
# low_estimate = (
# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
# ) # Find all span elements with data-testid="low-estimate"
# middle_estimate = (
# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
# ) # Find all span elements with data-testid="middle-estimate"
# high_estimate = (
# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
# ) # Find all span elements with data-testid="high-estimate-blurred"
#
# return {
# "URL": url,
# "Low Estimate": low_estimate,
# "Middle Estimate": middle_estimate,
# "High Estimate": high_estimate,
# }
def main():
    """Scrape a Zoopla valuation for every UPRN in the asset list.

    Reads the standardised asset list workbook, scrapes each distinct UPRN's
    Zoopla page in a pool of worker processes, writes the raw results to
    ``zoopla_estimates.csv`` and writes the asset list with the valuations
    joined on to a new workbook in the current directory.
    """
    # Get a SAL
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
        "Standardised.xlsx",
        sheet_name="Standardised Asset List",
    )

    # Normalise UPRNs to plain digit strings: they are embedded in the URLs
    # and used as the merge key below.
    # NOTE: astype(int) raises if any UPRN is missing or non-numeric.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    # Scrape each UPRN once. Duplicated UPRNs would otherwise be fetched
    # repeatedly and, because the scraped frame would then contain the same
    # key several times, multiply rows in the left merge below.
    uprns = asset_list["epc_os_uprn"].drop_duplicates().tolist()
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    with Pool(processes=5) as pool:
        estimates_list = list(
            tqdm(
                pool.imap(parallel_task, urls),
                total=len(urls),
            )
        )

    df = pd.DataFrame(estimates_list)
    # Extract UPRN from URL; the digits are exactly the strings built above,
    # so no further type normalisation is needed before merging.
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/", expand=False)
    # na_action="ignore" leaves missing estimates as NaN instead of passing
    # them to parse_price.
    df["valuation"] = df["Middle Estimate"].map(parse_price, na_action="ignore")
    df.to_csv("zoopla_estimates.csv", index=False)

    valuations = df[["uprn", "valuation"]]
    merged = asset_list.merge(
        valuations, left_on="epc_os_uprn", right_on="uprn", how="left"
    )
    merged.to_excel(
        "Property Box Finance Portfolio - Standardised - with valuations.xlsx",
        index=False,
    )


if __name__ == "__main__":
    main()