"""Scrape Zoopla sale estimates for every UPRN in an asset list.

Reads UPRNs from the standardised asset list workbook, fetches each property's
Zoopla page in a small process pool, writes the scraped estimates to CSV, and
writes a copy of the asset list with a numeric ``valuation`` column merged in.
"""

import random
import time
from multiprocessing import Pool
from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup
from stealth_requests import StealthSession
from tqdm import tqdm

# Browser fingerprints passed as ``impersonate=`` on each request.
ENGINES = ["safari", "chrome"]

# Upper bound on re-fetches of a page that shows no estimate block. A page
# without a "sale-estimate" div is treated as "blocked", so without a cap a
# URL that never yields that div would keep its worker retrying forever.
MAX_RETRIES = 10

ASSET_LIST_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
    "Standardised.xlsx"
)
ASSET_LIST_SHEET = "Standardised Asset List"
ESTIMATES_CSV_PATH = "zoopla_estimates.csv"
OUTPUT_XLSX_PATH = "Property Box Finance Portfolio - Standardised - with valuations.xlsx"

# Multipliers for the magnitude suffixes handled by ``parse_price``.
_SUFFIX_MULTIPLIERS = {"k": 1_000, "m": 1_000_000}


def scrape_all_estimates(session, url: str):
    """Fetch *url* and return its sale-estimate blocks.

    Args:
        session: An open ``StealthSession`` used to issue the request.
        url: Property page to fetch.

    Returns:
        A ``(estimates, is_blocked)`` tuple. ``estimates`` holds every
        ``<div data-testid="sale-estimate">`` found in the response and
        ``is_blocked`` is ``True`` when there are none.
    """
    # Rotate the impersonated browser per request.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    return estimates, not estimates


def _estimate_text(block, tag: str, test_id: str) -> Optional[str]:
    """Return the text of the *tag* with ``data-testid`` *test_id*, or ``None`` if absent."""
    node = block.find(tag, {"data-testid": test_id})
    return node.text if node is not None else None


def parallel_task(url: str, max_retries: int = MAX_RETRIES) -> dict:
    """Scrape the low / middle / high estimate for a single property URL.

    Args:
        url: Property page to scrape.
        max_retries: How many times to re-fetch the page after the first
            attempt when no estimate block is found.

    Returns:
        A dict with keys ``"URL"``, ``"Low Estimate"``, ``"Middle Estimate"``
        and ``"High Estimate"``. The estimate values are the scraped text, or
        ``None`` when the page never produced an estimate block or the
        specific element was missing from it.
    """
    # No impersonate argument here: the engine is chosen per request instead.
    with StealthSession() as session:
        estimates, is_blocked = scrape_all_estimates(session, url)

        retries = 0
        while is_blocked and retries < max_retries:
            print(f"Blocked by Zoopla for URL: {url}")
            time.sleep(random.uniform(0, 1))  # random delay between attempts
            estimates, is_blocked = scrape_all_estimates(session, url)
            retries += 1

    if is_blocked:
        # Give up on this URL but keep the row so the rest of the run survives.
        print(f"Giving up on URL after {max_retries} retries: {url}")
        low_estimate = middle_estimate = high_estimate = None
    else:
        block = estimates[0]
        low_estimate = _estimate_text(block, "span", "low-estimate-blurred")
        middle_estimate = _estimate_text(block, "p", "estimate-blurred")
        high_estimate = _estimate_text(block, "span", "high-estimate-blurred")

    return {
        "URL": url,
        "Low Estimate": low_estimate,
        "Middle Estimate": middle_estimate,
        "High Estimate": high_estimate,
    }


def parse_price(p) -> float:
    """Convert a price label into a number of pounds.

    Strips the pound sign, thousands separators and surrounding whitespace,
    then applies a trailing ``k`` (thousand) or ``m`` (million) suffix,
    case-insensitively.

    Args:
        p: Price text, or ``None`` / NaN for a missing estimate.

    Returns:
        The price as a float, or NaN when *p* is missing or blank.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    if pd.isna(p):
        return float("nan")

    text = str(p).replace("£", "").replace(",", "").strip().lower()
    if not text:
        return float("nan")

    multiplier = _SUFFIX_MULTIPLIERS.get(text[-1])
    if multiplier is None:
        return float(text)
    return float(text[:-1]) * multiplier


def main() -> None:
    """Scrape an estimate for every UPRN in the asset list and save the results."""
    # Load the standardised asset list.
    asset_list = pd.read_excel(ASSET_LIST_PATH, sheet_name=ASSET_LIST_SHEET)
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    # Scrape each UPRN only once (order preserved). Duplicate UPRNs would
    # otherwise be fetched repeatedly and would multiply rows in the merge.
    uprns = list(dict.fromkeys(asset_list["epc_os_uprn"]))
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    with Pool(processes=5) as pool:
        estimates_list = list(
            tqdm(
                pool.imap(parallel_task, urls),
                total=len(urls),
            )
        )

    df = pd.DataFrame(estimates_list)

    # Extract UPRN from URL
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
    df.to_csv(ESTIMATES_CSV_PATH, index=False)

    # Normalise the key the same way as ``epc_os_uprn`` before joining.
    df["uprn"] = df["uprn"].astype(int).astype(str)
    asset_list.merge(
        df[["uprn", "valuation"]],
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left",
        validate="many_to_one",  # each UPRN must map to exactly one valuation
    ).to_excel(OUTPUT_XLSX_PATH, index=False)


if __name__ == "__main__":
    main()