improving valuations scraper

This commit is contained in:
Khalim Conn-Kowlessar 2025-10-29 20:26:45 +00:00
parent 27de54adef
commit 9c5d68f55f
5 changed files with 92 additions and 74 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

View file

@ -298,13 +298,13 @@ def app():
landlord_block_reference = None
# Project from Nick
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/Sep2025 Project"
data_filename = "AL Test.xlsx"
sheet_name = "Sheet1"
postcode_column = 'postcode'
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio"
data_filename = "22.10 AL Portfolio.xlsx"
sheet_name = "22.10 AL Portfolio"
postcode_column = 'Postcode'
address1_column = None
address1_method = 'house_number_extraction'
fulladdress_column = "address"
fulladdress_column = "Address"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = None
@ -315,7 +315,7 @@ def app():
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "row_id"
landlord_property_id = "Row ID"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None

View file

@ -3,109 +3,126 @@ import pandas as pd
import time
from stealth_requests import StealthSession
import random
import os
from multiprocessing import Pool
from tqdm import tqdm
# Browser identities passed as the `impersonate` argument; one is chosen at random per request.
ENGINES = ["safari", "chrome"]
# Directory holding successfully scraped property pages, stored as <uprn>.html.
CACHE_DIR = "zoopla_cache"
# Create the cache directory at import time; exist_ok=True makes this a no-op when it already exists.
os.makedirs(CACHE_DIR, exist_ok=True)
def random_delay(min_seconds=0.5, max_seconds=2):
    """Pause for a random interval between requests (0.5-2 s by default).

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))
def scrape_all_estimates(session, url):
    """Scrape valuation estimates for one Zoopla property URL.

    Args:
        session: Object exposing ``get(url, impersonate=...)`` whose response
            has a ``text`` attribute (a ``StealthSession`` in this script).
        url: Property page URL to fetch.

    Returns:
        A 3-tuple ``(estimates, is_blocked, html)``:
        ``estimates`` is the list of ``div[data-testid="sale-estimate"]``
        elements found, ``is_blocked`` is True when none were found, and
        ``html`` is the raw response text (so callers can cache it).
    """
    # Rotate the impersonated browser on every request; exactly one request
    # is issued per call.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    html = resp.text
    page_source = BeautifulSoup(html, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    # A page without any estimate block is treated as a block/empty response.
    is_blocked = not estimates
    return estimates, is_blocked, html
def extract_estimates(estimates):
    """Extract low, mid, and high estimates from parsed HTML.

    Args:
        estimates: Sequence of parsed elements; only the first one is read.
            Each element must expose ``find(name, attrs)`` returning an object
            with a ``text`` attribute, or ``None`` when nothing matches.

    Returns:
        A ``(low, mid, high)`` tuple of the raw text of each estimate. Any
        value whose element is missing is ``None``; an empty ``estimates``
        yields ``(None, None, None)``.
    """
    if not estimates:
        return None, None, None

    est = estimates[0]

    def _text_of(tag_name, test_id):
        # A missing element yields None instead of raising AttributeError,
        # so one malformed page cannot bring down a pool worker.
        node = est.find(tag_name, {"data-testid": test_id})
        return None if node is None else node.text

    low = _text_of("span", "low-estimate-blurred")
    mid = _text_of("p", "estimate-blurred")
    high = _text_of("span", "high-estimate-blurred")
    return low, mid, high
def cache_path_for_url(url):
    """Return a deterministic local cache path for a URL.

    The last non-empty path segment of ``url`` is used as the file name, so
    ``.../uprn/123/`` and ``.../uprn/123`` both map to ``<CACHE_DIR>/123.html``.

    Args:
        url: Property URL whose final path segment identifies the property.

    Returns:
        Path of the cache file inside ``CACHE_DIR``.
    """
    # Strip trailing slashes first so a URL without one does not resolve to
    # the literal "uprn" segment.
    uprn = url.rstrip("/").rsplit("/", 1)[-1]
    return os.path.join(CACHE_DIR, f"{uprn}.html")
def parallel_task(url, max_attempts=5):
    """Main worker function executed in each process.

    Looks for a cached copy of the page first; otherwise scrapes it live,
    retrying up to ``max_attempts`` times with a random pause between tries.
    Only pages that contain an estimate block are written to the cache.

    Args:
        url: Zoopla property URL to resolve.
        max_attempts: Maximum number of live requests before giving up.

    Returns:
        Dict with keys ``"URL"``, ``"Low Estimate"``, ``"Middle Estimate"``
        and ``"High Estimate"``. The three estimates are ``None`` when every
        attempt came back blocked or empty.
    """
    cache_path = cache_path_for_url(url)

    # Use cached file if it exists and still contains an estimate block.
    if os.path.exists(cache_path):
        # errors="replace" tolerates cache files written earlier with a
        # different platform default encoding.
        with open(cache_path, "r", encoding="utf-8", errors="replace") as cached:
            html = cached.read()
        page_source = BeautifulSoup(html, "html.parser")
        estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
        if estimates:
            low, mid, high = extract_estimates(estimates)
            return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}

    # Otherwise scrape live, with a bounded number of retries.
    with StealthSession() as session:
        for attempt in range(1, max_attempts + 1):
            estimates, is_blocked, html = scrape_all_estimates(session, url)
            if not is_blocked and estimates:
                with open(cache_path, "w", encoding="utf-8") as cached:
                    cached.write(html)
                low, mid, high = extract_estimates(estimates)
                return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}
            print(f"[Attempt {attempt}] Blocked or empty for {url}")
            random_delay()

    # If still blocked, return placeholders
    return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None}
def parse_price(p):
    """Convert a displayed price such as "£250k", "£1.5m" or "£350,000" to a float.

    The pound sign, thousands separators and surrounding whitespace are
    removed; a trailing ``k`` multiplies by 1,000 and a trailing ``m`` by
    1,000,000 (case-insensitive).

    Args:
        p: Price text, or ``None``.

    Returns:
        The price as a float, or ``None`` when ``p`` is not a string, is
        empty after cleaning, or cannot be parsed as a number.
    """
    # Non-string input (None, NaN from a DataFrame, ...) has no price.
    if not isinstance(p, str):
        return None

    cleaned = p.replace("£", "").replace(",", "").strip().lower()
    if not cleaned:
        return None

    multiplier = 1
    if cleaned.endswith("k"):
        multiplier, cleaned = 1_000, cleaned[:-1]
    elif cleaned.endswith("m"):
        multiplier, cleaned = 1_000_000, cleaned[:-1]

    # One try covers the suffixed and plain forms, so junk text returns None
    # in every branch.
    try:
        return float(cleaned) * multiplier
    except ValueError:
        return None
if __name__ == "__main__":
    # Load portfolio
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List",
    )

    # Keep only rows that have a UPRN. Copy so the column assignment below
    # operates on an independent frame rather than a slice of the original.
    asset_list = asset_list[asset_list["epc_os_uprn"].notna()].copy()
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)

    # De-duplicate UPRNs (order preserved) so each property is requested once
    # and the left merge below cannot multiply rows.
    uprns = list(dict.fromkeys(asset_list["epc_os_uprn"]))
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    # Limit concurrency to avoid blocks
    with Pool(processes=2) as pool:  # fewer processes = fewer fingerprints
        estimates_list = list(
            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    # Explicit columns keep the frame well-formed even when there are no URLs.
    df = pd.DataFrame(
        estimates_list,
        columns=["URL", "Low Estimate", "Middle Estimate", "High Estimate"],
    )

    # Extract UPRN from URL
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/", expand=False)
    df["valuation"] = df["Middle Estimate"].apply(parse_price)
    df.to_csv("zoopla_estimates.csv", index=False)
    df["uprn"] = df["uprn"].astype(int).astype(str)

    # Merge with asset list
    merged = asset_list.merge(
        df[["uprn", "valuation"]],
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left",
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False,
    )
    print("Done. Results saved.")

View file

@ -427,8 +427,9 @@ def optimise_with_funding_paths(p, input_measures, housing_type, funding: Fundin
solutions["meets_upgrade_target"] = solutions["total_gain"] >= target_gain - 0.1
# If we have packages that are fundable, but do not meet the upgrade target, we can run a final optimisation pass
if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
logger.info("We have some packages that are fundable but do not meet the target gain")
# Turned off logging - too noisy
# if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
# logger.info("We have some packages that are fundable but do not meet the target gain")
# We now can calculate the project ABS, which subtracts from the cost, but this is only relevant for ECO4
solutions["starting_sap"] = p.data["current-energy-efficiency"]