# Model/etl/webscrape/Zoopla.py

from bs4 import BeautifulSoup
import pandas as pd
import time
from stealth_requests import StealthSession
import random
from multiprocessing import Pool
from tqdm import tqdm

ENGINES = ["safari", "chrome"]


def scrape_all_estimates(session, url):
    """Fetch ``url`` and collect its sale-estimate elements.

    Args:
        session: An open session object exposing ``get(url, impersonate=...)``.
        url: Address of the property page to fetch.

    Returns:
        A ``(estimates, is_blocked)`` tuple. ``estimates`` holds every
        ``div`` whose ``data-testid`` is ``"sale-estimate"``; ``is_blocked``
        is ``True`` when no such element was found in the response.
    """
    # Rotate the impersonated browser on every request. random.choice follows
    # the length of ENGINES, so adding or removing an engine needs no change here.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    soup = BeautifulSoup(resp.text, "html.parser")
    estimates = soup.find_all("div", {"data-testid": "sale-estimate"})
    # A response without any sale-estimate element is treated as a blocked request.
    is_blocked = not estimates
    return estimates, is_blocked


def parallel_task(url):
    """Scrape the low, middle and high estimate texts for a single URL.

    Opens its own ``StealthSession`` and keeps requesting ``url`` until
    ``scrape_all_estimates`` stops reporting the response as blocked, pausing
    for a random interval of up to one second between attempts.

    NOTE: the retry loop has no upper bound; it only ends once a response
    contains at least one sale-estimate element.

    Args:
        url: Address of the property page to scrape.

    Returns:
        A dict with the keys ``"URL"``, ``"Low Estimate"``,
        ``"Middle Estimate"`` and ``"High Estimate"``; the estimate values are
        the raw element texts taken from the first sale-estimate element.
    """
    # The session is created without a fixed impersonation; the engine is
    # chosen per request inside scrape_all_estimates.
    with StealthSession() as session:
        while True:
            found, blocked = scrape_all_estimates(session, url)
            if not blocked:
                break
            print(f"Blocked by Zoopla for URL: {url}")
            time.sleep(random.uniform(0, 1))

        # Only the first sale-estimate element is read.
        card = found[0]
        low_text = card.find("span", {"data-testid": "low-estimate-blurred"}).text
        middle_text = card.find("p", {"data-testid": "estimate-blurred"}).text
        high_text = card.find("span", {"data-testid": "high-estimate-blurred"}).text

    result = {
        "URL": url,
        "Low Estimate": low_text,
        "Middle Estimate": middle_text,
        "High Estimate": high_text,
    }
    return result


def parse_price(p):
    """Convert a price string such as ``"£250k"`` or ``"£1.2m"`` to a float.

    The pound sign, thousands separators (commas) and surrounding whitespace
    are ignored, and matching is case-insensitive. A trailing ``k`` multiplies
    the number by 1,000 and a trailing ``m`` by 1,000,000.

    Args:
        p: The price text to parse.

    Returns:
        The price as a float.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # Commas are dropped so that fully written-out prices ("£1,250,000") parse
    # instead of raising ValueError.
    text = p.replace("£", "").replace(",", "").strip().lower()
    multipliers = {"k": 1000, "m": 1_000_000}
    suffix = text[-1:]  # empty string for empty input, so the lookup below is safe
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    return float(text)


# def parallel_task(url):
#     with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
#         estimates, is_blocked = scrape_all_estimates(session, url)
#
#         while is_blocked:
#             # Will need to wait and retry if blocked by Zoopla
#             print(f"Blocked by Zoopla for URL: {url}")
#             sleep_factor = random.uniform(0, 1)  # Random delay to avoid detection
#             time.sleep(sleep_factor * 1)
#             estimates, is_blocked = scrape_all_estimates(session, url)
#
#         low_estimate = (
#             estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
#         )  # Find all span elements with data-testid="low-estimate"
#         middle_estimate = (
#             estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
#         )  # Find all span elements with data-testid="middle-estimate"
#         high_estimate = (
#             estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
#         )  # Find all span elements with data-testid="high-estimate-blurred"
#
#     return {
#         "URL": url,
#         "Low Estimate": low_estimate,
#         "Middle Estimate": middle_estimate,
#         "High Estimate": high_estimate,
#     }


if __name__ == "__main__":
    # Load the standardised asset list (SAL) that holds the UPRNs to value.
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
        "Standardised.xlsx",
        sheet_name="Standardised Asset List",
    )
    # Cast through int so every UPRN is a plain digit string; the same
    # normalisation is applied to the scraped UPRNs before the merge below.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    # Scrape each UPRN only once. Repeated UPRNs in the asset list would
    # otherwise be requested several times and, because the results are merged
    # back on UPRN, would multiply the matching rows in the output.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    uprns = list(dict.fromkeys(asset_list["epc_os_uprn"]))
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    # Fan the URLs out over five worker processes; imap yields results in
    # input order, which lets tqdm show progress as they arrive.
    with Pool(processes=5) as pool:
        estimates_list = list(
            tqdm(
                pool.imap(parallel_task, urls),
                total=len(urls),
            )
        )

    df = pd.DataFrame(estimates_list)
    # Extract UPRN from URL
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    # The middle estimate is used as the valuation figure.
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
    df.to_csv("zoopla_estimates.csv", index=False)

    # Normalise the scraped UPRNs the same way as the asset list keys.
    df["uprn"] = df["uprn"].astype(int).astype(str)

    # Left merge keeps every asset row, including any without a valuation.
    asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
        "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
    )