# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 88 lines, 2.9 KiB, Python
import requests
|
|
import random
|
|
import time
|
|
import pandas as pd
|
|
from bs4 import BeautifulSoup
|
|
from tqdm import tqdm
|
|
from seleniumbase import Driver
|
|
from seleniumbase import page_actions
|
|
|
|
import undetected_chromedriver as webdriver
|
|
from selenium.webdriver.chrome.service import Service
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.common.keys import Keys
|
|
import time
|
|
import pandas as pd
|
|
|
|
# URL template for a single property page; `{uprn}` is filled in with the
# value from the asset list's "UPRN" column via `BASE_URL.format(uprn=...)`.
BASE_URL = "https://www.zoopla.co.uk/property/uprn/{uprn}/"
|
|
|
|
|
|
def initialize_driver():
    """Create the browser driver used for scraping.

    Returns:
        A SeleniumBase ``Driver`` started with ``headless=True`` and
        ``uc=True`` (SeleniumBase's undetected-chromedriver mode).
        Pass ``headless=False`` here instead if a visible browser window
        is needed while debugging.
    """
    return Driver(headless=True, uc=True)
|
|
|
|
|
|
def _element_text(driver, selector):
    """Return the stripped visible text of the first element matching *selector*.

    Selenium ``WebElement`` objects expose their text through the ``.text``
    property (``get_text`` is BeautifulSoup API and does not exist here).
    """
    return driver.find_element(By.CSS_SELECTOR, selector).text.strip()


def _save_checkpoint(result, index):
    """Write the rows scraped so far to ``savepoint_index_<index>.csv``."""
    pd.DataFrame(result).to_csv(f"savepoint_index_{index}.csv", index=False)


def app():
    """Scrape the price estimate shown on each property page and checkpoint it.

    Reads the asset list from ``portfolio_epc_data_50m 28th May.xlsx``
    (columns ``UPRN``, ``ADDRESS`` and ``POSTCODE`` must exist), builds one
    ``BASE_URL`` page per UPRN, and for every page collects the text of the
    estimate, low-estimate and high-estimate elements.

    Rows are accumulated as dicts with the keys ``UPRN``, ``price``,
    ``lower_estimate`` and ``upper_estimate``. The accumulated rows are
    written to ``savepoint_index_<i>.csv`` whenever a page fails and once
    more when the run ends (normally or through an interruption).

    NOTE(review): the original indentation of the savepoint step was not
    recoverable from the mirrored source; saving on failure and on exit is
    assumed to match the intent -- confirm against the upstream file.

    Raises:
        FileNotFoundError: if the input spreadsheet is missing.
        KeyError: if one of the expected columns is absent.
    """
    # Read in the starting asset list
    asset_list = pd.read_excel("portfolio_epc_data_50m 28th May.xlsx")
    asset_list = asset_list[["UPRN", "ADDRESS", "POSTCODE"]]

    # Generate the list of urls
    uprns = asset_list["UPRN"].tolist()
    urls = [BASE_URL.format(uprn=uprn) for uprn in uprns]

    # Start exactly one browser; a second, unused Chrome instance would
    # otherwise be left running for the whole scrape.
    driver = initialize_driver()

    result = []
    last_index = None
    try:
        driver.set_page_load_timeout(30)  # Increase page load timeout

        for i, (url, uprn) in tqdm(enumerate(zip(urls, uprns)), total=len(urls)):
            last_index = i

            # Every 10 requests sleep for an extra 7 seconds. Count requests
            # rather than collected rows so a streak of failures does not
            # trigger the pause on every single iteration.
            if i != 0 and i % 10 == 0:
                time.sleep(7)

            try:
                driver.get(url)
                page_actions.wait_for_element_visible(
                    driver, "p[data-testid='estimate-blurred']", timeout=30
                )

                row = {
                    "UPRN": uprn,
                    "price": _element_text(
                        driver, "p[data-testid='estimate-blurred']"
                    ),
                    "lower_estimate": _element_text(
                        driver, "span[data-testid='low-estimate-blurred']"
                    ),
                    "upper_estimate": _element_text(
                        driver, "span[data-testid='high-estimate-blurred']"
                    ),
                }
            except Exception as e:
                # Best-effort scrape: report the failure, keep what we have
                # on disk, and move on to the next property.
                print(f"Failed to retrieve data for UPRN {uprn} at iteration {i}: {e}")
                _save_checkpoint(result, i)
                continue

            result.append(row)

            # Sleep a random amount of time between 5 and 20 seconds
            time.sleep(random.uniform(5, 20))
    finally:
        try:
            # Persist whatever was collected, even if the run was interrupted.
            if last_index is not None:
                _save_checkpoint(result, last_index)
        finally:
            # Always release the browser process.
            driver.quit()

    # TODO: Testing Jina AI - didn't work but maybe one of the alternatives might work:
    # https://www.youtube.com/watch?v=QxHE4af5BQE
    # The experiment fetched
    # https://r.jina.ai/https://www.zoopla.co.uk/property/uprn/41222761/
    # with `requests.get` and discarded the response; the call was removed
    # because it only added a stray network request to every run.
|
|
|
|
|
|
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    app()
|