mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
111 lines
3.9 KiB
Python
111 lines
3.9 KiB
Python
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import time
|
|
from stealth_requests import StealthSession
|
|
import random
|
|
from multiprocessing import Pool
|
|
from tqdm import tqdm
|
|
|
|
# Browser engine names; scrape_all_estimates passes one of them as the
# `impersonate=` argument of each session.get call.
ENGINES = ["safari", "chrome"]
|
|
|
|
|
|
def scrape_all_estimates(session, url):
    """Fetch a property page and pull out its sale-estimate blocks.

    Args:
        session: Object exposing ``get(url, impersonate=...)`` whose response
            has a ``text`` attribute (a ``StealthSession`` elsewhere in this
            file).
        url: Address of the property page to request.

    Returns:
        A ``(estimates, is_blocked)`` tuple. ``estimates`` holds every ``div``
        element carrying ``data-testid="sale-estimate"``; ``is_blocked`` is
        ``True`` when no such element was found.
    """
    # Rotate the impersonated browser on every request. random.choice draws
    # from the whole of ENGINES, so adding or removing an engine needs no
    # matching change to a hard-coded index range.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    page_source = BeautifulSoup(resp.text, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    # An empty result is treated as "blocked". NOTE(review): a page that
    # legitimately has no estimate is indistinguishable from a block here.
    is_blocked = not estimates
    return estimates, is_blocked
|
|
|
|
|
|
def parallel_task(url, max_retries=None):
    """Scrape the low / middle / high estimate strings for one property URL.

    Opens a fresh ``StealthSession`` and keeps re-requesting the page, with a
    random pause of up to one second between attempts, until
    ``scrape_all_estimates`` stops reporting the page as blocked.

    Args:
        url: Property page to scrape.
        max_retries: Upper bound on the number of re-requests after the first
            attempt. ``None`` (the default) retries indefinitely, which is
            the historical behaviour. Pass a number to stop hammering pages
            that will never yield an estimate.

    Returns:
        A dict with keys ``"URL"``, ``"Low Estimate"``, ``"Middle Estimate"``
        and ``"High Estimate"``. The estimate values are the raw element
        texts. They are ``None`` only when ``max_retries`` was given and
        exhausted while the page was still reported as blocked.

    Raises:
        AttributeError: If an estimate block is present but one of the three
            expected child elements is missing.
    """
    # The session itself takes no impersonate argument; the engine is chosen
    # per request inside scrape_all_estimates.
    with StealthSession() as session:
        estimates, is_blocked = scrape_all_estimates(session, url)

        retries_done = 0
        while is_blocked:
            if max_retries is not None and retries_done >= max_retries:
                # Caller opted into a bound: give up and report no estimates
                # instead of looping forever.
                return {
                    "URL": url,
                    "Low Estimate": None,
                    "Middle Estimate": None,
                    "High Estimate": None,
                }
            retries_done += 1
            print(f"Blocked by Zoopla for URL: {url}")
            # Random sub-second pause before trying again.
            time.sleep(random.uniform(0, 1))
            estimates, is_blocked = scrape_all_estimates(session, url)

        # Only the first estimate block on the page is used.
        first_block = estimates[0]
        low_estimate = first_block.find("span", {"data-testid": "low-estimate-blurred"}).text
        middle_estimate = first_block.find("p", {"data-testid": "estimate-blurred"}).text
        high_estimate = first_block.find("span", {"data-testid": "high-estimate-blurred"}).text

        return {
            "URL": url,
            "Low Estimate": low_estimate,
            "Middle Estimate": middle_estimate,
            "High Estimate": high_estimate,
        }
|
|
|
|
|
|
def parse_price(p):
    """Convert a displayed price such as ``"£250k"`` or ``"£1.5m"`` to a float.

    The pound sign, thousands separators (commas) and surrounding whitespace
    are ignored, and a trailing ``k`` / ``m`` (any case) scales the number by
    one thousand / one million.

    Args:
        p: Price text. ``None``, a float NaN or a blank string are treated as
            a missing value.

    Returns:
        The price as a float, or NaN when the input is missing.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    # Missing scrape results should flow through as NaN instead of raising
    # AttributeError on ``.replace``.
    if p is None or (isinstance(p, float) and p != p):
        return float("nan")

    cleaned = str(p).replace("£", "").replace(",", "").strip().lower()
    if not cleaned:
        return float("nan")

    multipliers = {"k": 1000, "m": 1_000_000}
    suffix = cleaned[-1]
    if suffix in multipliers:
        return float(cleaned[:-1]) * multipliers[suffix]
    return float(cleaned)
|
|
|
|
|
|
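# Earlier version of parallel_task, kept for reference only: it passed the
# impersonated engine when constructing the session.
# def parallel_task(url):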
|
|
# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session:
|
|
# estimates, is_blocked = scrape_all_estimates(session, url)
|
|
#
|
|
# while is_blocked:
|
|
# # Will need to wait and retry if blocked by Zoopla
|
|
# print(f"Blocked by Zoopla for URL: {url}")
|
|
# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection
|
|
# time.sleep(sleep_factor * 1)
|
|
# estimates, is_blocked = scrape_all_estimates(session, url)
|
|
#
|
|
# low_estimate = (
|
|
# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text
|
|
# )  # Text of the span element with data-testid="low-estimate-blurred"
|
|
# middle_estimate = (
|
|
# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text
|
|
# )  # Text of the p element with data-testid="estimate-blurred"
|
|
# high_estimate = (
|
|
# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text
|
|
# )  # Text of the span element with data-testid="high-estimate-blurred"
|
|
#
|
|
# return {
|
|
# "URL": url,
|
|
# "Low Estimate": low_estimate,
|
|
# "Middle Estimate": middle_estimate,
|
|
# "High Estimate": high_estimate,
|
|
# }
|
|
|
|
|
|
if __name__ == "__main__":
    # Load the standardised asset list (SAL) that supplies the UPRNs to value.
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - "
        "Standardised.xlsx",
        sheet_name="Standardised Asset List",
    )
    # Normalise UPRNs to plain integer strings; the same form is used in the
    # URLs and as the merge key further down.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    # Scrape every UPRN once only: repeated UPRNs would otherwise be requested
    # several times and then multiply rows in the left merge below.
    uprns = asset_list["epc_os_uprn"].drop_duplicates().tolist()
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    # Fan the requests out over five worker processes; imap keeps the results
    # in input order and lets tqdm show progress as they arrive.
    with Pool(processes=5) as pool:
        estimates_list = list(
            tqdm(
                pool.imap(parallel_task, urls),
                total=len(urls),
            )
        )

    df = pd.DataFrame(estimates_list)
    # Checkpoint the raw scrape first so a failure while parsing prices does
    # not throw away the whole run; the enriched file overwrites it below.
    df.to_csv("zoopla_estimates.csv", index=False)

    # Extract UPRN from URL
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/")
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
    df.to_csv("zoopla_estimates.csv", index=False)

    # Bring the extracted UPRN to the same integer-string form as the merge key.
    df["uprn"] = df["uprn"].astype(int).astype(str)

    # Attach the valuation to every asset row and write the enriched workbook.
    asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel(
        "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False
    )
|