"""Scrape Zoopla price estimates for a list of UPRNs read from a spreadsheet."""

import random
import time

import pandas as pd
import requests
import undetected_chromedriver as webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from seleniumbase import Driver
from seleniumbase import page_actions
from tqdm import tqdm

BASE_URL = "https://www.zoopla.co.uk/property/uprn/{uprn}/"

# Workbook holding the starting asset list; must contain the columns
# "UPRN", "ADDRESS" and "POSTCODE".
DEFAULT_INPUT_PATH = "portfolio_epc_data_50m 28th May.xlsx"

# CSS selectors of the three estimate elements read from each property page.
PRICE_SELECTOR = "p[data-testid='estimate-blurred']"
LOW_PRICE_SELECTOR = "span[data-testid='low-estimate-blurred']"
HIGH_PRICE_SELECTOR = "span[data-testid='high-estimate-blurred']"

# Seconds allowed for a page load and for the estimate element to appear.
PAGE_TIMEOUT = 30

# TODO: Testing Jina AI - didn't work but maybe one of the alternatives might work:
# https://www.youtube.com/watch?v=QxHE4af5BQE
JINA_READER_URL = "https://r.jina.ai/https://www.zoopla.co.uk/property/uprn/41222761/"


def initialize_driver(headless=True):
    """Create the SeleniumBase driver used for scraping.

    Args:
        headless: Passed through to ``Driver``; ``True`` runs the browser
            without a visible window.

    Returns:
        The driver returned by ``Driver(headless=..., uc=True)``.
    """
    return Driver(headless=headless, uc=True)


def _element_text(driver, css_selector):
    """Return the stripped text of the first element matching *css_selector*."""
    # Selenium WebElements expose ``.text``; ``get_text(strip=True)`` is the
    # BeautifulSoup API and raises AttributeError on a WebElement.
    return driver.find_element(By.CSS_SELECTOR, css_selector).text.strip()


def _scrape_estimate(driver, url):
    """Load *url* and return its price, lower and upper estimate texts."""
    driver.get(url)
    page_actions.wait_for_element_visible(
        driver, PRICE_SELECTOR, timeout=PAGE_TIMEOUT
    )
    return {
        "price": _element_text(driver, PRICE_SELECTOR),
        "lower_estimate": _element_text(driver, LOW_PRICE_SELECTOR),
        "upper_estimate": _element_text(driver, HIGH_PRICE_SELECTOR),
    }


def _save_checkpoint(result, index):
    """Write the rows collected so far to ``savepoint_index_<index>.csv``."""
    pd.DataFrame(result).to_csv(f"savepoint_index_{index}.csv", index=False)


def _probe_jina_reader(timeout=PAGE_TIMEOUT):
    """Fetch the sample property page through the Jina reader proxy.

    Experimental (see the TODO next to ``JINA_READER_URL``).

    Returns:
        The response body as text.
    """
    # A timeout keeps an unresponsive proxy from hanging the script forever.
    response = requests.get(JINA_READER_URL, timeout=timeout)
    return response.text


def app(input_path=DEFAULT_INPUT_PATH):
    """Scrape the estimate for every UPRN in the asset list.

    Reads the workbook at *input_path*, visits the Zoopla page of each UPRN
    and collects the displayed price plus the lower and upper estimates.
    Progress is written to ``savepoint_index_<i>.csv`` whenever a page fails
    and once more after the last page.

    Args:
        input_path: Excel workbook with "UPRN", "ADDRESS" and "POSTCODE"
            columns.
    """
    # Read in the starting asset list
    asset_list = pd.read_excel(input_path)
    asset_list = asset_list[["UPRN", "ADDRESS", "POSTCODE"]]
    # asset_list.to_excel("property value.xlsx", index=False)

    # Generate the list of urls
    urls = [BASE_URL.format(uprn=uprn) for uprn in asset_list["UPRN"]]
    uprns = asset_list["UPRN"].tolist()

    # Only one browser is started; a second throwaway instance would never be
    # closed and would leak a browser process.
    driver = initialize_driver()
    result = []
    last_index = None
    try:
        driver.set_page_load_timeout(PAGE_TIMEOUT)  # Increase page load timeout

        for i, (url, uprn) in tqdm(enumerate(zip(urls, uprns)), total=len(urls)):
            last_index = i

            # Every 10 requests sleep for an extra 7 seconds. Keyed on the
            # request index rather than the number of successes so a failing
            # page cannot trigger the pause on every following iteration.
            if i != 0 and i % 10 == 0:
                time.sleep(7)

            try:
                estimate = _scrape_estimate(driver, url)
            except Exception as e:
                # Best effort: report the failure, keep what we have, move on.
                print(f"Failed to retrieve data for UPRN {uprn} at iteration {i}: {e}")
                # Store the result depending on where we are
                _save_checkpoint(result, i)
            else:
                result.append({"UPRN": uprn, **estimate})
                # Sleep a random amount of time between 5 and 20 seconds
                time.sleep(5 + (15 * random.random()))

        # Persist the final state too, so a fully successful run leaves output.
        if last_index is not None:
            _save_checkpoint(result, last_index)
    finally:
        # Always release the browser, even when interrupted or on error.
        driver.quit()

    # Experimental Jina reader probe; its result is currently unused.
    _probe_jina_reader()


if __name__ == "__main__":
    app()