From 93723697a18aeed93ef9d784fae9fff477cf62e8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 27 Oct 2025 15:27:32 +0000 Subject: [PATCH] allow no valuation and work with relative --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- asset_list/app.py | 110 ++++++++++++++++++++++++++- backend/SearchEpc.py | 5 +- backend/engine/engine.py | 46 +++++++++--- backend/ml_models/Valuation.py | 15 +++- etl/webscrape/Zoopla.py | 133 +++++++++++++++++++++++++-------- etl/webscrape/requirements.txt | 5 ++ 8 files changed, 265 insertions(+), 53 deletions(-) create mode 100644 etl/webscrape/requirements.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index 09f2e496..c6561970 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/asset_list/app.py b/asset_list/app.py index 2903e083..20cf04f1 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -59,9 +59,111 @@ def app(): Property UPRN """ + # + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/" + data_filename = "22.10_Cambridge_west addresses.xlsx" + sheet_name = "Asset List" + postcode_column = 'Postcode' + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Full Address" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + + # Property Box + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box" + data_filename = "Property Box Finance Portfolio.xlsx" + sheet_name = "Sheet1" + postcode_column = 'Postcode' + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Address 1" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "row_id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = "block_id" + + # CDS - able-to-pay + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay" + data_filename = "CDS_ASSET LIST_(2314).xlsx" + sheet_name = "Sheet1" + postcode_column = 'Property Address - Postcode' + address1_column = "Property Address - Line 1" + address1_method = None + fulladdress_column = "Property Address - Line 1" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "row_id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + # Hyde - solar data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar" - data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx" + data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx" sheet_name = "Electric Property Inspections" postcode_column = 'Postcode' address1_column = None # Is only patchily populated so we create it @@ -88,14 +190,14 @@ def app(): master_filepaths = [] master_id_colnames = [] master_to_asset_list_filepath = None - phase = True + phase = False ecosurv_landlords = None asset_list_header = 0 landlord_block_reference = None # Hyde cavity data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity" - data_filename = "Domna Property Analysis HYDE (Chichester Removed).xlsx" + data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx" sheet_name = "Cavity Inspections" postcode_column = 'Postcode' address1_column = None # Is only patchily populated so we create it @@ -122,7 +224,7 @@ def app(): master_filepaths = [] master_id_colnames = [] master_to_asset_list_filepath = None - phase = True + phase = False ecosurv_landlords = None asset_list_header = 0 landlord_block_reference = None diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 16dd8f04..1a14e87a 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -347,7 +347,8 @@ class SearchEpc: # We update the data with the correct uprn if self.uprn: for x in api_response["response"]["rows"]: - x["uprn"] = self.uprn + if pd.isnull(x["uprn"]): + x["uprn"] = self.uprn data["rows"].extend(api_response["response"]["rows"]) @@ -357,6 +358,8 @@ class SearchEpc: row for row in data["rows"] if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] + # Overwrite the data + self.data = data if data["rows"]: api_response["msg"] = self.SUCCESS diff --git a/backend/engine/engine.py b/backend/engine/engine.py index fa1f191c..f2674290 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data( cost = sum([r["total"] for r in default_recommendations]) sap_point_improvement = sum([r["sap_points"] for r in default_recommendations]) - lower_bound_valuation_uplift = ( - property_value_increase_ranges[p.id]["lower_bound_increased_value"] - - property_value_increase_ranges[p.id]["current_value"] - ) - upper_bound_valuation_uplift = ( - property_value_increase_ranges[p.id]["upper_bound_increased_value"] - - property_value_increase_ranges[p.id]["current_value"] - ) + if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]): + lower_bound_valuation_uplift = ( + property_value_increase_ranges[p.id]["lower_bound_increased_value"] - + property_value_increase_ranges[p.id]["current_value"] + ) + upper_bound_valuation_uplift = ( + property_value_increase_ranges[p.id]["upper_bound_increased_value"] - + property_value_increase_ranges[p.id]["current_value"] + ) + else: + lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0 agg_data.append({ "pre_retrofit_epc": p.data["current-energy-rating"], @@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest): plan_input["built_form"] = plan_input["built_form"].map(built_form_map) plan_input = plan_input.to_dict("records") + else: raise ValueError("Other formats not yet supported") @@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest): # If we have patches or overrides, we should read them in here patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body) + if body.file_type == "xlsx" and body.file_format == "domna_asset_list": + # We check if we have valution data + if not valuation_data and body.valuation_file_path in [None, ""]: + # We check plan_input + if "domna_valuation" in plan_input[0]: + valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input] + cleaning_data = read_dataframe_from_s3_parquet( bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", ) @@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest): if uprn: uprn = int(float(uprn)) + address1 = config.get("address", None) + # Handle domna address list format + if pd.isnull(address1) and body.file_format == "domna_asset_list": + address1 = config.get("domna_full_address", None) + + address1 = str(int(address1)) if isinstance(address1, float) else str(address1) + + full_address = config["domna_full_address"] if body.file_format == "domna_asset_list" else None + epc_searcher = SearchEpc( - address1=str(config["address"]), + address1=address1, postcode=config["postcode"], uprn=uprn, auth_token=get_settings().EPC_AUTH_TOKEN, os_api_key="", + full_address=full_address ) epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None) epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None) @@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest): upload_funding(session, p, new_plan_id, recommendations_to_upload) - property_valuation_increases.append( - valuations["average_increased_value"] - valuations["current_value"] - ) + if valuations["current_value"] > 0: + property_valuation_increases.append( + valuations["average_increased_value"] - valuations["current_value"] + ) # Commit the session after each batch session.commit() diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 8c57900f..17db0dae 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -219,12 +219,19 @@ class PropertyValuation: current_epc = property_instance.data["current-energy-rating"] if not current_value: + # In this case, we return a % improvement rather than an absolute + relative_improvement = cls.estimate_valuation_improvement( + current_value=1, + current_epc=current_epc, + target_epc=target_epc, + total_cost=1 + ) return { "current_value": 0, - "lower_bound_increased_value": 0, - "upper_bound_increased_value": 0, - "average_increased_value": 0, - "average_increase": 0 + "lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1, + "upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1, + "average_increased_value": relative_improvement["average_increased_value"] - 1, + "average_increase": relative_improvement["average_increase"] } return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost) diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py index bb86c759..7b3fd5b6 100644 --- a/etl/webscrape/Zoopla.py +++ b/etl/webscrape/Zoopla.py @@ -1,38 +1,111 @@ -# Initial Code - -from seleniumbase import SB +from bs4 import BeautifulSoup +import pandas as pd import time +from stealth_requests import StealthSession +import random +from multiprocessing import Pool +from tqdm import tqdm -uprns = [ - 100071297618, - 100080893397, - 100060778033, - 200004793081, - 100071265143, - 100071297618, - 100080893397, - 100060778033, - 200004793081, - 100071265143, -] +ENGINES = ["safari", "chrome"] -estimate_list = [] -for uprn in uprns: +def scrape_all_estimates(session, url): + # Rotate impersonation per request + resp = session.get(url, impersonate=ENGINES[random.randint(0, 1)]) + page_source = BeautifulSoup(resp.text, "html.parser") + estimates = page_source.find_all("div", {"data-testid": "sale-estimate"}) + is_blocked = len(estimates) == 0 + return estimates, is_blocked - # Probably can change the timings here - time.sleep(5) - with SB(uc=True) as sb: - sb.uc_open_with_reconnect( - f"https://www.zoopla.co.uk/property/uprn/{uprn}/", - 3, + +def parallel_task(url): + # No impersonate argument here + with StealthSession() as session: + estimates, is_blocked = scrape_all_estimates(session, url) + + while is_blocked: + print(f"Blocked by Zoopla for URL: {url}") + time.sleep(random.uniform(0, 1)) + estimates, is_blocked = scrape_all_estimates(session, url) + + low_estimate = estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text + middle_estimate = estimates[0].find("p", {"data-testid": "estimate-blurred"}).text + high_estimate = estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text + + return { + "URL": url, + "Low Estimate": low_estimate, + "Middle Estimate": middle_estimate, + "High Estimate": high_estimate, + } + + +def parse_price(p): + p = p.replace("£", "").strip().lower() + if p.endswith("k"): + return float(p[:-1]) * 1000 + elif p.endswith("m"): + return float(p[:-1]) * 1_000_000 + else: + return float(p) + + +# def parallel_task(url): +# with StealthSession(impersonate=ENGINES[random.randint(0, 1)]) as session: +# estimates, is_blocked = scrape_all_estimates(session, url) +# +# while is_blocked: +# # Will need to wait and retry if blocked by Zoopla +# print(f"Blocked by Zoopla for URL: {url}") +# sleep_factor = random.uniform(0, 1) # Random delay to avoid detection +# time.sleep(sleep_factor * 1) +# estimates, is_blocked = scrape_all_estimates(session, url) +# +# low_estimate = ( +# estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text +# ) # Find all span elements with data-testid="low-estimate" +# middle_estimate = ( +# estimates[0].find("p", {"data-testid": "estimate-blurred"}).text +# ) # Find all span elements with data-testid="middle-estimate" +# high_estimate = ( +# estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text +# ) # Find all span elements with data-testid="high-estimate-blurred" +# +# return { +# "URL": url, +# "Low Estimate": low_estimate, +# "Middle Estimate": middle_estimate, +# "High Estimate": high_estimate, +# } + + +if __name__ == "__main__": + # Get a SAL + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - " + "Standardised.xlsx", + sheet_name="Standardised Asset List" + ) + asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str) + uprns = asset_list["epc_os_uprn"].tolist() + urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns] + + with Pool(processes=5) as pool: + estimates_list = list( + tqdm( + pool.imap(parallel_task, urls), + total=len(urls), + ) ) - soup = sb.get_beautiful_soup() + df = pd.DataFrame(estimates_list) + # Extract UPRN from URL + df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/") + df["valuation"] = df["Middle Estimate"].apply(parse_price) + df.to_csv("zoopla_estimates.csv", index=False) - estimates = soup.find_all("div", {"data-testid": "sale-estimate"}) - # Can change the way we extract the text here - estimate_text = ( - estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"] - ) - estimate_list.append(estimate_text) + df["uprn"] = df["uprn"].astype(int).astype(str) + + asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel( + "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False + ) diff --git a/etl/webscrape/requirements.txt b/etl/webscrape/requirements.txt new file mode 100644 index 00000000..4027a224 --- /dev/null +++ b/etl/webscrape/requirements.txt @@ -0,0 +1,5 @@ +beautifulsoup4>=4.12.0 +pandas>=2.0.0 +stealth-requests>=1.0.7 +tqdm>=4.65.0 +openpyxl \ No newline at end of file