Model/etl/property_valuation/scrape_valuations.py
2024-05-30 13:38:26 +01:00

88 lines
2.9 KiB
Python

import requests
import random
import time
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from seleniumbase import Driver
from seleniumbase import page_actions
import undetected_chromedriver as webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
# URL template for a property's Zoopla page; ``{uprn}`` is filled in with the
# property's UPRN via ``BASE_URL.format(uprn=...)``.
BASE_URL = "https://www.zoopla.co.uk/property/uprn/{uprn}/"
def initialize_driver():
    """Create the SeleniumBase browser driver used for scraping.

    Returns:
        The object produced by ``Driver(headless=True, uc=True)``.
    """
    # headless=True runs without a visible browser window (set it to False to
    # watch the browser); uc=True requests SeleniumBase's undetected-chromedriver mode.
    return Driver(uc=True, headless=True)
def _read_text(driver, selector):
    """Return the stripped visible text of the first element matching *selector*."""
    # Selenium WebElements expose their text through the ``.text`` property.
    # ``get_text(strip=True)`` is BeautifulSoup API and raises AttributeError
    # on a WebElement, which previously made every scrape fail.
    return driver.find_element("css selector", selector).text.strip()


def app():
    """Scrape Zoopla valuation estimates for every UPRN in the asset list.

    Reads ``portfolio_epc_data_50m 28th May.xlsx``, opens the Zoopla page for
    each UPRN and collects the estimate together with its low and high
    bounds. The rows gathered so far are written to
    ``savepoint_index_{i}.csv`` after each request, so an interrupted run
    keeps its progress.

    Failures for individual UPRNs are printed and skipped; they do not stop
    the run.
    """
    # Read in the starting asset list
    asset_list = pd.read_excel("portfolio_epc_data_50m 28th May.xlsx")
    asset_list = asset_list[["UPRN", "ADDRESS", "POSTCODE"]]
    uprns = asset_list["UPRN"].tolist()

    # Start exactly one browser. A second, bare ``webdriver.Chrome()`` used to
    # be opened here and was immediately overwritten, leaking a Chrome process.
    driver = initialize_driver()
    result = []
    try:
        driver.set_page_load_timeout(30)  # Increase page load timeout

        for i, uprn in tqdm(enumerate(uprns), total=len(uprns)):
            # Every 10 requests sleep for an extra 7 seconds. This is keyed on
            # the request index rather than the number of successful rows, so
            # failed requests still count towards the pause.
            if i != 0 and i % 10 == 0:
                time.sleep(7)

            try:
                driver.get(BASE_URL.format(uprn=uprn))
                page_actions.wait_for_element_visible(
                    driver, "p[data-testid='estimate-blurred']", timeout=30
                )
                result.append(
                    {
                        "UPRN": uprn,
                        "price": _read_text(
                            driver, "p[data-testid='estimate-blurred']"
                        ),
                        "lower_estimate": _read_text(
                            driver, "span[data-testid='low-estimate-blurred']"
                        ),
                        "upper_estimate": _read_text(
                            driver, "span[data-testid='high-estimate-blurred']"
                        ),
                    }
                )
                # Sleep a random amount of time between 5 and 20 seconds
                time.sleep(5 + (15 * random.random()))
            except Exception as e:
                # Deliberately broad: one bad page must not abort the whole run.
                print(f"Failed to retrieve data for UPRN {uprn} at iteration {i}: {e}")

            # Store the result depending on where we are.
            # NOTE(review): indentation was lost in the copy of this file that
            # was reviewed; the savepoint is assumed to belong at loop level
            # (one file per request) - confirm against the original.
            savepoint = pd.DataFrame(result)
            savepoint.to_csv(f"savepoint_index_{i}.csv", index=False)
    finally:
        # Always close the browser, even if the run is interrupted.
        driver.quit()
# TODO: Testing Jina AI - didn't work but maybe one of the alternatives might work:
# https://www.youtube.com/watch?v=QxHE4af5BQE
# NOTE(review): indentation was lost in the copy of this file that was
# reviewed, so it is unclear whether this experiment runs at module level or
# at the end of app() - confirm before relying on it.
# The timeout stops the request from blocking forever if the reader service
# never responds (requests has no default timeout).
response = requests.get(
    "https://r.jina.ai/https://www.zoopla.co.uk/property/uprn/41222761/",
    timeout=30,
)
# Bare expression kept from the original experiment; its value is discarded.
response.text
# Start the scrape only when this file is executed as a script.
if __name__ == "__main__":
    app()