diff --git a/.idea/Model.iml b/.idea/Model.iml index c6561970..09f2e496 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..fb10c6b0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/app.py b/asset_list/app.py index bb5cb427..b832a3e8 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -298,13 +298,13 @@ def app(): landlord_block_reference = None # Project from Nick - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/Sep2025 Project" - data_filename = "AL Test.xlsx" - sheet_name = "Sheet1" - postcode_column = 'postcode' + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio" + data_filename = "22.10 AL Portfolio.xlsx" + sheet_name = "22.10 AL Portfolio" + postcode_column = 'Postcode' address1_column = None address1_method = 'house_number_extraction' - fulladdress_column = "address" + fulladdress_column = "Address" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None @@ -315,7 +315,7 @@ def app(): landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "row_id" + landlord_property_id = "Row ID" landlord_sap = None outcomes_filename = None outcomes_sheetname = None diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py index 7b3fd5b6..2c446dc8 100644 --- a/etl/webscrape/Zoopla.py +++ b/etl/webscrape/Zoopla.py @@ -3,109 +3,126 @@ import pandas as pd import time from stealth_requests import StealthSession import random +import os from multiprocessing import Pool from tqdm import tqdm ENGINES = ["safari", "chrome"] +CACHE_DIR = "zoopla_cache" +os.makedirs(CACHE_DIR, exist_ok=True) + + +def random_delay(): + """Pause randomly between requests (0.5–2 s).""" + time.sleep(random.uniform(0.5, 2)) def scrape_all_estimates(session, url): - # Rotate impersonation per request - resp = session.get(url, impersonate=ENGINES[random.randint(0, 1)]) + """Scrape valuation estimates for one Zoopla property URL.""" + resp = session.get(url, impersonate=random.choice(ENGINES)) page_source = BeautifulSoup(resp.text, "html.parser") estimates = page_source.find_all("div", {"data-testid": "sale-estimate"}) is_blocked = len(estimates) == 0 - return estimates, is_blocked + return estimates, is_blocked, resp.text + + +def extract_estimates(estimates): + """Extract low, mid, and high estimates from parsed HTML.""" + est = estimates[0] + low = est.find("span", {"data-testid": "low-estimate-blurred"}).text + mid = est.find("p", {"data-testid": "estimate-blurred"}).text + high = est.find("span", {"data-testid": "high-estimate-blurred"}).text + return low, mid, high + + +def cache_path_for_url(url): + """Return a deterministic local cache path for a URL.""" + uprn = url.split("/")[-2] + return os.path.join(CACHE_DIR, f"{uprn}.html") def parallel_task(url): - # No impersonate argument here + """Main worker function executed in each process.""" + cache_path = cache_path_for_url(url) + + # Use cached file if it exists + if os.path.exists(cache_path): + html = open(cache_path, "r").read() + page_source = BeautifulSoup(html, "html.parser") + estimates = page_source.find_all("div", {"data-testid": "sale-estimate"}) + if estimates: + low, mid, high = extract_estimates(estimates) + return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high} + + # Otherwise scrape live with StealthSession() as session: - estimates, is_blocked = scrape_all_estimates(session, url) + attempts = 0 + while attempts < 5: + estimates, is_blocked, html = scrape_all_estimates(session, url) + if not is_blocked and estimates: + open(cache_path, "w").write(html) + low, mid, high = extract_estimates(estimates) + return {"URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high} + attempts += 1 + print(f"[Attempt {attempts}] Blocked or empty for {url}") + random_delay() - while is_blocked: - print(f"Blocked by Zoopla for URL: {url}") - time.sleep(random.uniform(0, 1)) - estimates, is_blocked = scrape_all_estimates(session, url) - - low_estimate = estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text - middle_estimate = estimates[0].find("p", {"data-testid": "estimate-blurred"}).text - high_estimate = estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text - - return { - "URL": url, - "Low Estimate": low_estimate, - "Middle Estimate": middle_estimate, - "High Estimate": high_estimate, - } + # If still blocked, return placeholders + return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None} def parse_price(p): + if p is None: + return None + p = p.replace("£", "").strip().lower() + if not p: + return None if p.endswith("k"): - return float(p[:-1]) * 1000 + return float(p[:-1]) * 1_000 elif p.endswith("m"): return float(p[:-1]) * 1_000_000 else: - return float(p) - - -# def parallel_task(url): -# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session: -# estimates, is_blocked = scrape_all_estimates(session, url) -# -# while is_blocked: -# # Will need to wait and retry if blocked by Zoopla -# print(f"Blocked by Zoopla for URL: {url}") -# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection -# time.sleep(sleep_factor * 1) -# estimates, is_blocked = scrape_all_estimates(session, url) -# -# low_estimate = ( -# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text -# ) # Find all span elements with data-testid="low-estimate" -# middle_estimate = ( -# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text -# ) # Find all span elements with data-testid="middle-estimate" -# high_estimate = ( -# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text -# ) # Find all span elements with data-testid="high-estimate-blurred" -# -# return { -# "URL": url, -# "Low Estimate": low_estimate, -# "Middle Estimate": middle_estimate, -# "High Estimate": high_estimate, -# } + try: + return float(p.replace(",", "")) + except ValueError: + return None if __name__ == "__main__": - # Get a SAL + # Load portfolio asset_list = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - " - "Standardised.xlsx", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - " + "Standardised - partial UPRN fill.xlsx", sheet_name="Standardised Asset List" ) + asset_list = asset_list[~pd.isnull(asset_list["epc_os_uprn"])] asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str) uprns = asset_list["epc_os_uprn"].tolist() urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns] - with Pool(processes=5) as pool: + # Limit concurrency to avoid blocks + with Pool(processes=2) as pool: # fewer processes = fewer fingerprints estimates_list = list( - tqdm( - pool.imap(parallel_task, urls), - total=len(urls), - ) + tqdm(pool.imap(parallel_task, urls), total=len(urls)) ) df = pd.DataFrame(estimates_list) - # Extract UPRN from URL df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/") df["valuation"] = df["Middle Estimate"].apply(parse_price) + df.to_csv("zoopla_estimates.csv", index=False) - df["uprn"] = df["uprn"].astype(int).astype(str) - - asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel( - "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False + # Merge with asset list + merged = asset_list.merge( + df[["uprn", "valuation"]], + left_on="epc_os_uprn", + right_on="uprn", + how="left" ) + merged.to_excel( + "20251029 AL Portfolio - Standardised - with valuations.xlsx", + index=False + ) + + print("Done. Results saved.") diff --git a/recommendations/optimiser/funding_optimiser.py b/recommendations/optimiser/funding_optimiser.py index 4da08587..5acdd5fd 100644 --- a/recommendations/optimiser/funding_optimiser.py +++ b/recommendations/optimiser/funding_optimiser.py @@ -427,8 +427,9 @@ def optimise_with_funding_paths(p, input_measures, housing_type, funding: Fundin solutions["meets_upgrade_target"] = solutions["total_gain"] >= target_gain - 0.1 # If we have packages that are fundable, but do not meet the upgrade target, we can run a final optimisation pass - if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty: - logger.info("We have some packages that are fundable but do not meet the target gain") + # Turned off logging - too noisy + # if not solutions[solutions["is_eligible"] & ~solutions["meets_upgrade_target"]].empty: + # logger.info("We have some packages that are fundable but do not meet the target gain") # We now can calculate the project ABS, which subtracts from the cost, but this is only relevant for ECO4 solutions["starting_sap"] = p.data["current-energy-efficiency"]