mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
improving valuations scraper
This commit is contained in:
parent
27de54adef
commit
9c5d68f55f
5 changed files with 92 additions and 74 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -298,13 +298,13 @@ def app():
|
|||
landlord_block_reference = None
|
||||
|
||||
# Project from Nick
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/Sep2025 Project"
|
||||
data_filename = "AL Test.xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = 'postcode'
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio"
|
||||
data_filename = "22.10 AL Portfolio.xlsx"
|
||||
sheet_name = "22.10 AL Portfolio"
|
||||
postcode_column = 'Postcode'
|
||||
address1_column = None
|
||||
address1_method = 'house_number_extraction'
|
||||
fulladdress_column = "address"
|
||||
fulladdress_column = "Address"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
|
|
@ -315,7 +315,7 @@ def app():
|
|||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "row_id"
|
||||
landlord_property_id = "Row ID"
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
|
|
|
|||
|
|
@ -3,109 +3,126 @@ import pandas as pd
|
|||
import time
|
||||
from stealth_requests import StealthSession
|
||||
import random
|
||||
import os
|
||||
from multiprocessing import Pool
|
||||
from tqdm import tqdm
|
||||
|
||||
ENGINES = ["safari", "chrome"]
|
||||
CACHE_DIR = "zoopla_cache"
|
||||
os.makedirs(CACHE_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def random_delay(min_seconds=0.5, max_seconds=2.0):
    """Pause for a random interval between requests.

    Args:
        min_seconds: Lower bound of the pause in seconds (default 0.5).
        max_seconds: Upper bound of the pause in seconds (default 2).
    """
    time.sleep(random.uniform(min_seconds, max_seconds))
|
||||
|
||||
|
||||
def scrape_all_estimates(session, url):
    """Fetch one Zoopla property page and locate its sale-estimate blocks.

    Args:
        session: Object exposing ``get(url, impersonate=...)`` whose response
            has a ``text`` attribute (``parallel_task`` passes a
            ``StealthSession``).
        url: Property page URL to request.

    Returns:
        A 3-tuple ``(estimates, is_blocked, html)`` where ``estimates`` is the
        list of ``div[data-testid="sale-estimate"]`` elements found,
        ``is_blocked`` is True when none were found, and ``html`` is the raw
        response text (returned so the caller can cache it).
    """
    # Pick the impersonated browser engine at random for every request.
    resp = session.get(url, impersonate=random.choice(ENGINES))
    html = resp.text
    page_source = BeautifulSoup(html, "html.parser")
    estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
    # A page without any sale-estimate block is treated as a blocked response.
    is_blocked = not estimates
    return estimates, is_blocked, html
|
||||
|
||||
|
||||
def extract_estimates(estimates):
    """Extract the low, middle and high estimate strings from parsed HTML.

    Only the first element of ``estimates`` is inspected.

    Args:
        estimates: Sequence of parsed elements, each exposing
            ``find(tag_name, attrs)`` that returns an object with a ``text``
            attribute, or ``None`` when nothing matches.

    Returns:
        A 3-tuple ``(low, mid, high)`` of the raw text values. An entry is
        ``None`` when its element is missing; all three are ``None`` when
        ``estimates`` is empty.
    """
    if not estimates:
        return None, None, None

    est = estimates[0]

    def _text(tag_name, test_id):
        # find() returns None when the element is absent; report that as None
        # instead of raising AttributeError on ``.text``.
        node = est.find(tag_name, {"data-testid": test_id})
        return node.text if node is not None else None

    low = _text("span", "low-estimate-blurred")
    mid = _text("p", "estimate-blurred")
    high = _text("span", "high-estimate-blurred")
    return low, mid, high
|
||||
|
||||
|
||||
def cache_path_for_url(url, cache_dir=None):
    """Return a deterministic local cache path for a property URL.

    The file name is the last path segment of ``url`` (the UPRN for URLs of
    the form ``.../property/uprn/<uprn>/``) with an ``.html`` suffix.

    Args:
        url: Property page URL, with or without a trailing slash.
        cache_dir: Directory for the cache file. Defaults to the module-level
            ``CACHE_DIR``.

    Returns:
        Path of the cache file inside ``cache_dir``.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR
    # Strip trailing slashes first so that ".../uprn/123" and ".../uprn/123/"
    # map to the same file, rather than the former resolving to "uprn.html".
    uprn = url.rstrip("/").rsplit("/", 1)[-1]
    return os.path.join(cache_dir, f"{uprn}.html")
|
||||
|
||||
|
||||
def parallel_task(url):
    """Return the valuation estimates for one property URL.

    Worker function mapped over the URL list by the process pool. A cached
    copy of the page is used when it exists and contains a sale-estimate
    block; otherwise the page is scraped live, with up to five attempts.

    Args:
        url: Zoopla property page URL.

    Returns:
        Dict with keys ``"URL"``, ``"Low Estimate"``, ``"Middle Estimate"``
        and ``"High Estimate"``. The estimate values are ``None`` when every
        attempt came back blocked or empty.
    """
    max_attempts = 5

    def _row(low, mid, high):
        # Single place that defines the result schema.
        return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high}

    cache_path = cache_path_for_url(url)

    # Use the cached file if it exists and still contains an estimate block.
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as cached:
            html = cached.read()
        page_source = BeautifulSoup(html, "html.parser")
        estimates = page_source.find_all("div", {"data-testid": "sale-estimate"})
        if estimates:
            return _row(*extract_estimates(estimates))

    # Otherwise scrape live, retrying a bounded number of times.
    with StealthSession() as session:
        for attempt in range(1, max_attempts + 1):
            estimates, is_blocked, html = scrape_all_estimates(session, url)
            if not is_blocked and estimates:
                # Only successful pages are cached, so a blocked response is
                # never replayed from disk.
                with open(cache_path, "w", encoding="utf-8") as cached:
                    cached.write(html)
                return _row(*extract_estimates(estimates))
            print(f"[Attempt {attempt}] Blocked or empty for {url}")
            random_delay()

    # If still blocked, return placeholders.
    return _row(None, None, None)
|
||||
|
||||
|
||||
def parse_price(p):
    """Convert a price string such as ``"£250k"`` or ``"£1.2m"`` to a float.

    The pound sign, thousands separators and surrounding whitespace are
    ignored. A trailing ``k`` multiplies by 1,000 and a trailing ``m`` by
    1,000,000 (case-insensitive).

    Args:
        p: Price text, or ``None``.

    Returns:
        The price as a float, or ``None`` when ``p`` is ``None``, empty, or
        not parseable as a number.
    """
    if p is None:
        return None

    cleaned = p.replace("£", "").replace(",", "").strip().lower()
    if not cleaned:
        return None

    multiplier = 1
    if cleaned.endswith("k"):
        multiplier, cleaned = 1_000, cleaned[:-1]
    elif cleaned.endswith("m"):
        multiplier, cleaned = 1_000_000, cleaned[:-1]

    # One guarded conversion covers the plain, "k" and "m" forms alike.
    try:
        return float(cleaned) * multiplier
    except ValueError:
        return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Load the standardised portfolio and keep only rows that have a UPRN.
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - "
        "Standardised - partial UPRN fill.xlsx",
        sheet_name="Standardised Asset List",
    )
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered slice of the original.
    asset_list = asset_list[~pd.isnull(asset_list["epc_os_uprn"])].copy()
    # Going through int removes any ".0" float formatting before the value is
    # used in the URL and as the merge key.
    asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str)
    uprns = asset_list["epc_os_uprn"].tolist()
    urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns]

    if not urls:
        # Nothing to scrape; the column lookups below would fail on an empty frame.
        raise SystemExit("No rows with a UPRN found in the asset list.")

    # Limit concurrency to avoid blocks: fewer processes = fewer fingerprints.
    with Pool(processes=2) as pool:
        estimates_list = list(
            tqdm(pool.imap(parallel_task, urls), total=len(urls))
        )

    df = pd.DataFrame(estimates_list)
    # Extract the UPRN from the URL; expand=False yields a Series for the
    # single capture group.
    df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/", expand=False)
    df["valuation"] = df["Middle Estimate"].apply(parse_price)

    df.to_csv("zoopla_estimates.csv", index=False)

    # Merge the valuations back onto the asset list.
    merged = asset_list.merge(
        df[["uprn", "valuation"]],
        left_on="epc_os_uprn",
        right_on="uprn",
        how="left",
    )
    merged.to_excel(
        "20251029 AL Portfolio - Standardised - with valuations.xlsx",
        index=False,
    )

    print("Done. Results saved.")
|
||||
|
|
|
|||
|
|
@ -427,8 +427,9 @@ def optimise_with_funding_paths(p, input_measures, housing_type, funding: Fundin
|
|||
solutions["meets_upgrade_target"] = solutions["total_gain"] >= target_gain - 0.1
|
||||
|
||||
# If we have packages that are fundable, but do not meet the upgrade target, we can run a final optimisation pass
|
||||
if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
|
||||
logger.info("We have some packages that are fundable but do not meet the target gain")
|
||||
# Turned off logging - too noisy
|
||||
# if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty:
|
||||
# logger.info("We have some packages that are fundable but do not meet the target gain")
|
||||
|
||||
# We now can calculate the project ABS, which subtracts from the cost, but this is only relevant for ECO4
|
||||
solutions["starting_sap"] = p.data["current-energy-efficiency"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue