diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index dce929ae..da20432b 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -309,6 +309,17 @@ class AssetList: 'NAME OF SURVEYOR' ] + # Solar non-intrusive fields + NON_INTRUSIVES_SOLAR_COLNAMES = [ + 'PV, ACCESS ISSUE, SEE NOTES', 'ROOF ORIENTATION', + 'AREA (m²) OF ROOF WHERE PV WILL BE SITUATED ', 'SHADING', + 'Roof Tiles - CONCRETE/SLATE/ROSEMARY', + 'NO. OF PANELS (Typical size of 420W panel is 1mx1.7m and need 30cm all the way around panels)', + 'SCAFFOLD REQUIRED? IF YES, ARE THERE ANY SURROUNDING ACCESS ISSUES - PLEASE DESCRIBE', + 'IF PANELS ARE GOING ON REAR PLEASE CHECK FOR SPACE FOR SCAFFOLDING - DESCRIBE ANY ISSUES BELOW', + 'DATE', 'NAME OF SURVEYOR' + ] + NON_INTRUSIVES_ELIGIBILITY_COLUMN = "Eligibility (Red/Yellow/Green)" OLD_FORMAT_NON_INTRUSIVE_COLNAMES = ['WFT Findings', 'ECO Eligibility'] @@ -461,6 +472,8 @@ class AssetList: self.new_format_non_insturives_present_v2 = 'TILE HUNG' in self.raw_asset_list.columns + self.solar_non_intrusives_present = "AREA (m²) OF ROOF WHERE PV WILL BE SITUATED" in self.raw_asset_list.columns.astype(str).str.strip() # tolerate stray whitespace around the header + # Names of columns self.landlord_property_id = landlord_property_id self.address1_colname = address1_colname @@ -774,6 +787,9 @@ class AssetList: if self.new_format_non_insturives_present_v2: non_intrusive_columns += self.NON_INTRUSIVES_NEW_FORMAT_COLNAMES_V2 + if self.solar_non_intrusives_present: + non_intrusive_columns += self.NON_INTRUSIVES_SOLAR_COLNAMES + if self.old_format_non_intrusives_present: # We check if we have the ECO Eligibility column, which we might not have non_intrusive_columns = [ @@ -946,7 +962,7 @@ class AssetList: if self.phase: # We filter on just the properties that have had an inspection - if self.new_format_non_insturives_present_v2: + if self.new_format_non_insturives_present_v2 or self.solar_non_intrusives_present: self.standardised_asset_list = self.standardised_asset_list[ ~self.standardised_asset_list['NAME OF 
SURVEYOR'].isin( ["YET TO BE SURVEYED", "", None] @@ -1341,10 +1357,10 @@ class AssetList: # for identifying cavity jobs if self.non_intrusives_present and not self.old_format_non_intrusives_present: - if self.new_format_non_insturives_present_v2: + if self.new_format_non_insturives_present_v2 or self.solar_non_intrusives_present: existing_solar_non_intrusives_check = ( self.standardised_asset_list["non-intrusives: ROOF ORIENTATION"].str.strip().isin( - ["ALREADY HAS SOLAR PV"] + ["ALREADY HAS SOLAR PV", "ALREADY HAS PV"] ) ) else: diff --git a/asset_list/app.py b/asset_list/app.py index 833050fb..20cf04f1 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -59,6 +59,176 @@ def app(): Property UPRN """ + # + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cambridge/" + data_filename = "22.10_Cambridge_west addresses.xlsx" + sheet_name = "Asset List" + postcode_column = 'Postcode' + address1_column = None + address1_method = "house_number_extraction" + fulladdress_column = "Full Address" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + + # Property Box + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box" + data_filename = "Property Box Finance Portfolio.xlsx" + sheet_name = "Sheet1" + postcode_column = 'Postcode' + address1_column = None + 
address1_method = "house_number_extraction" + fulladdress_column = "Address 1" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "row_id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = "block_id" + + # CDS - able-to-pay + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/Able to pay" + data_filename = "CDS_ASSET LIST_(2314).xlsx" + sheet_name = "Sheet1" + postcode_column = 'Property Address - Postcode' + address1_column = "Property Address - Line 1" + address1_method = None + fulladdress_column = "Property Address - Line 1" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "row_id" + landlord_sap = None + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + + # Hyde - solar + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Solar" + data_filename = 
"Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx" + sheet_name = "Electric Property Inspections" + postcode_column = 'Postcode' + address1_column = None # Is only patchily populated so we create it + address1_method = 'house_number_extraction' + fulladdress_column = "Address" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_built_form = "Property Type" + landlord_wall_construction = "Walls " + landlord_roof_construction = "Roofs" + landlord_heating_system = "Heating" + landlord_existing_pv = None + landlord_property_id = "Address ID" + landlord_sap = "SAP" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + + # Hyde cavity + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hyde/Cavity" + data_filename = "Domna Property Analysis HYDE (Chichester Removed)V2-Completed.xlsx" + sheet_name = "Cavity Inspections" + postcode_column = 'Postcode' + address1_column = None # Is only patchily populated so we create it + address1_method = 'house_number_extraction' + fulladdress_column = "Address" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_built_form = "Property Type" + landlord_wall_construction = "Walls " + landlord_roof_construction = "Roofs" + landlord_heating_system = "Heating" + landlord_existing_pv = None + landlord_property_id = "Address ID" + landlord_sap = "SAP" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + 
outcomes_address = None + master_filepaths = [] + master_id_colnames = [] + master_to_asset_list_filepath = None + phase = False + ecosurv_landlords = None + asset_list_header = 0 + landlord_block_reference = None + # CDS - Sept 2025 data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/CDS/September 2025 Programme" data_filename = "Founder Estates CDS.xlsx" diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index bdd82883..b02b8810 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -439,5 +439,23 @@ BUILT_FORM_MAPPINGS = { 'Chalet - Wheelchair': 'unknown', 'Studio Flat': 'unknown', 'Bungalow - Attached': 'semi-detached', - 'ND': 'unknown' + 'ND': 'unknown', + + 'Maisonette: Mid Terrace: Mid Floor': 'mid-terrace', + 'Maisonette: Semi Detached: Ground Floor': 'semi-detached', + 'Maisonette: Enclosed Mid Terrace: Ground Floor': 'enclosed mid-terrace', + 'Maisonette: Enclosed End Terrace: Ground Floor': 'end-terrace', + 'Maisonette: Mid Terrace: Ground Floor': 'mid-terrace', + 'Flat: Semi Detached: Basement': 'semi-detached', + 'Maisonette: Semi Detached: Top Floor': 'semi-detached', + 'Maisonette: Enclosed Mid Terrace: Mid Floor': 'enclosed mid-terrace', + 'Flat: Detached: Basement': 'detached', + 'Maisonette: Enclosed Mid Terrace: Top Floor': 'enclosed mid-terrace', + + 'Maisonette: End Terrace: Top Floor': 'end-terrace', + 'House: Mid Terrace: Ground Floor': 'mid-terrace', + 'Maisonette: Semi Detached: Mid Floor': 'semi-detached', + 'Maisonette: Detached: Mid Floor': 'detached', + 'Bungalow: EnclosedMidTerrace': 'enclosed mid-terrace' + } diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py index 4ab8ca72..ffd1b198 100644 --- a/asset_list/mappings/heating_systems.py +++ b/asset_list/mappings/heating_systems.py @@ -477,6 +477,23 @@ HEATING_MAPPINGS = { 'Heat networks Heat networks (mains gas)': 'communal heating', 'ND Oil': 'oil fuel', - 'Boiler 
Biofuel': 'boiler - other fuel' + 'Boiler Biofuel': 'boiler - other fuel', + + 'Electric (direct acting) room heaters: Water- or oil-filled radiators': 'room heaters', + 'Other: Electric ceiling heating': 'electric ceiling', + 'Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C': 'air source heat pump', + 'Oil room heaters: Room heater, 2000 or later': 'room heaters', + 'Electric Underfloor Heating: In screed above insulation (standard or off peak)': 'electric underfloor', + 'Heat Pump: Electric Heat pumps: Air source heat pump in other cases': 'air source heat pump', + 'Electric Storage Systems: Old (large volume) storage heaters': 'electric storage heaters', + + 'Gas (including LPG) room heaters: Condensing gas fire': 'room heaters', + 'Solid fuel room heaters: Open fire in grate': 'solid fuel', + 'Solid fuel room heaters: Open fire with back boiler (no radiators)': 'solid fuel', + 'Community Heating Systems: Community heat pump (RdSAP)': 'communal heating', + 'Gas (including LPG) room heaters: Gas fire, open flue, 1980 or later (open fronted), sitting proud of, ' + 'and sealed to, fireplace opening': 'room heaters', + 'Boiler: A rated Regular Boiler, System 2: Boiler: C rated Regular Boiler': 'boiler - other fuel', + 'Boiler: G rated Combi': 'gas condensing combi' } diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py index 290e172a..88ec2934 100644 --- a/asset_list/mappings/property_type.py +++ b/asset_list/mappings/property_type.py @@ -343,5 +343,23 @@ PROPERTY_MAPPING = { 'bungalow': 'bungalow', 'flat': 'flat', 'FLA': 'flat', - 'HOU': 'house' + 'HOU': 'house', + + 'Maisonette: Mid Terrace: Mid Floor': 'maisonette', + 'Maisonette: Semi Detached: Ground Floor': 'maisonette', + 'Maisonette: Enclosed Mid Terrace: Ground Floor': 'maisonette', + 'Maisonette: Enclosed End Terrace: Ground Floor': 'maisonette', + 'Maisonette: Mid Terrace: Ground Floor': 'maisonette', + 'Flat: Semi Detached: Basement': 
'flat', + 'Maisonette: Semi Detached: Top Floor': 'maisonette', + 'Maisonette: Enclosed Mid Terrace: Mid Floor': 'maisonette', + 'Flat: Detached: Basement': 'flat', + 'Maisonette: Enclosed Mid Terrace: Top Floor': 'maisonette', + + 'Maisonette: End Terrace: Top Floor': 'maisonette', + 'House: Mid Terrace: Ground Floor': 'house', + 'Bungalow: EnclosedMidTerrace': 'bungalow', + 'Maisonette: Semi Detached: Mid Floor': 'maisonette', + 'Maisonette: Detached: Mid Floor': 'maisonette' + } diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py index 8ac926c0..0857b046 100644 --- a/asset_list/mappings/roof.py +++ b/asset_list/mappings/roof.py @@ -275,5 +275,30 @@ ROOF_CONSTRUCTION_MAPPINGS = { 'Pitched (vaulted ceiling) Non-joist': 'pitched unknown insulation', 'ND (inferred) ND (inferred)': 'unknown', 'Flat Non-joist': 'flat insulated', - 'Same dwelling above N/A': 'another dwelling above' + 'Same dwelling above N/A': 'another dwelling above', + + 'Flat: As Built, PitchedNormalLoftAccess: Unknown': 'flat unknown insulation', + 'PitchedNormalLoftAccess: Unknown, PitchedNormalNoLoftAccess: Unknown': 'pitched unknown insulation', + 'PitchedNormalLoftAccess: 400mm+': 'pitched insulated', + 'AnotherDwellingAbove: 150mm': 'another dwelling above', + 'Flat: 150mm': 'flat insulated', + 'AnotherDwellingAbove: 50mm': 'another dwelling above', + 'PitchedNormalNoLoftAccess: As Built': 'pitched no access to loft', + 'PitchedNormalLoftAccess: 250mm, PitchedWithSlopingCeiling: As Built': 'pitched insulated', + 'PitchedNormalLoftAccess: 200mm, PitchedWithSlopingCeiling: As Built': 'pitched insulated', + 'PitchedNormalLoftAccess: 350mm': 'pitched insulated', + 'PitchedNormalNoLoftAccess: 270mm': 'pitched no access to loft', + 'AnotherDwellingAbove: 100mm': 'another dwelling above', + + 'PitchedWithSlopingCeiling: Unknown': 'pitched unknown insulation', + 'AnotherDwellingAbove: Unknown, Flat: As Built': 'another dwelling above', + 'Flat: Unknown, PitchedNormalLoftAccess: 
25mm': 'flat unknown insulation', + 'SameDwellingAbove: Unknown': 'another dwelling above', + 'Flat: Unknown': 'flat unknown insulation', + 'Flat: 50mm, PitchedNormalLoftAccess: 100mm': 'flat insulated', + 'Flat: As Built, PitchedNormalLoftAccess: 250mm, PitchedWithSlopingCeiling: As Built': 'flat unknown insulation', + 'Flat: As Built, PitchedNormalLoftAccess: 400mm+': 'flat unknown insulation', + 'PitchedWithSlopingCeiling: As Built': 'pitched insulated', + 'PitchedNormalLoftAccess: As Built': 'pitched unknown insulation', + } diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py index 73db586e..418ae9f8 100644 --- a/asset_list/mappings/walls.py +++ b/asset_list/mappings/walls.py @@ -353,4 +353,7 @@ WALL_CONSTRUCTION_MAPPINGS = { 'System built As-built': "uninsulated system built", 'System built Internal': 'insulated system built', + 'Cavity: AsBuilt (1976-1982), TimberFrame: AsBuilt': 'cavity unknown insulation', + 'Cavity: FilledCavityPlusExternal': 'filled cavity' + } diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 16dd8f04..1a14e87a 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -347,7 +347,8 @@ class SearchEpc: # We update the data with the correct uprn if self.uprn: for x in api_response["response"]["rows"]: - x["uprn"] = self.uprn + if pd.isnull(x["uprn"]): + x["uprn"] = self.uprn data["rows"].extend(api_response["response"]["rows"]) @@ -357,6 +358,8 @@ class SearchEpc: row for row in data["rows"] if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] + # Overwrite the data + self.data = data if data["rows"]: api_response["msg"] = self.SUCCESS diff --git a/backend/engine/engine.py b/backend/engine/engine.py index fa1f191c..f2674290 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -145,14 +145,17 @@ def extract_portfolio_aggregation_data( cost = sum([r["total"] for r in default_recommendations]) sap_point_improvement = sum([r["sap_points"] for r in default_recommendations]) 
- lower_bound_valuation_uplift = ( - property_value_increase_ranges[p.id]["lower_bound_increased_value"] - - property_value_increase_ranges[p.id]["current_value"] - ) - upper_bound_valuation_uplift = ( - property_value_increase_ranges[p.id]["upper_bound_increased_value"] - - property_value_increase_ranges[p.id]["current_value"] - ) + if not pd.isnull(property_value_increase_ranges[p.id]["current_value"]): + lower_bound_valuation_uplift = ( + property_value_increase_ranges[p.id]["lower_bound_increased_value"] - + property_value_increase_ranges[p.id]["current_value"] + ) + upper_bound_valuation_uplift = ( + property_value_increase_ranges[p.id]["upper_bound_increased_value"] - + property_value_increase_ranges[p.id]["current_value"] + ) + else: + lower_bound_valuation_uplift, upper_bound_valuation_uplift = 0, 0 agg_data.append({ "pre_retrofit_epc": p.data["current-energy-rating"], @@ -523,6 +526,7 @@ async def model_engine(body: PlanTriggerRequest): plan_input["built_form"] = plan_input["built_form"].map(built_form_map) plan_input = plan_input.to_dict("records") + else: raise ValueError("Other formats not yet supported") @@ -549,6 +553,13 @@ async def model_engine(body: PlanTriggerRequest): # If we have patches or overrides, we should read them in here patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body) + if body.file_type == "xlsx" and body.file_format == "domna_asset_list": + # We check if we have valuation data + if not valuation_data and body.valuation_file_path in [None, ""]: + # Fall back to valuations embedded in plan_input; an empty plan_input has no first record to inspect + if plan_input and "domna_valuation" in plan_input[0]: + valuation_data = [{"uprn": x["uprn"], "valuation": x["domna_valuation"]} for x in plan_input] + cleaning_data = read_dataframe_from_s3_parquet( bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", ) @@ -563,12 +574,22 @@ async def model_engine(body: PlanTriggerRequest): if uprn: uprn = int(float(uprn)) + address1 = 
config.get("address", None) + # Handle domna address list format + if pd.isnull(address1) and body.file_format == "domna_asset_list": + address1 = config.get("domna_full_address", None) + + address1 = str(int(address1)) if isinstance(address1, float) and not pd.isnull(address1) else str(address1) # int(NaN) raises ValueError + + full_address = config.get("domna_full_address") if body.file_format == "domna_asset_list" else None + epc_searcher = SearchEpc( - address1=str(config["address"]), + address1=address1, postcode=config["postcode"], uprn=uprn, auth_token=get_settings().EPC_AUTH_TOKEN, os_api_key="", + full_address=full_address ) epc_searcher.ordnance_survey_client.built_form = config.get("built_form", None) epc_searcher.ordnance_survey_client.property_type = config.get("property_type", None) @@ -1176,9 +1197,10 @@ async def model_engine(body: PlanTriggerRequest): upload_funding(session, p, new_plan_id, recommendations_to_upload) - property_valuation_increases.append( - valuations["average_increased_value"] - valuations["current_value"] - ) + if valuations["current_value"] > 0: + property_valuation_increases.append( + valuations["average_increased_value"] - valuations["current_value"] + ) # Commit the session after each batch session.commit() diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 8c57900f..17db0dae 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -219,12 +219,19 @@ class PropertyValuation: current_epc = property_instance.data["current-energy-rating"] if not current_value: + # In this case, we return a % improvement rather than an absolute + relative_improvement = cls.estimate_valuation_improvement( + current_value=1, + current_epc=current_epc, + target_epc=target_epc, + total_cost=1 + ) return { "current_value": 0, - "lower_bound_increased_value": 0, - "upper_bound_increased_value": 0, - "average_increased_value": 0, - "average_increase": 0 + "lower_bound_increased_value": relative_improvement["lower_bound_increased_value"] - 1, + 
"upper_bound_increased_value": relative_improvement["upper_bound_increased_value"] - 1, + "average_increased_value": relative_improvement["average_increased_value"] - 1, + "average_increase": relative_improvement["average_increase"] } return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost) diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py index bb86c759..7b3fd5b6 100644 --- a/etl/webscrape/Zoopla.py +++ b/etl/webscrape/Zoopla.py @@ -1,38 +1,95 @@ -# Initial Code - -from seleniumbase import SB +from bs4 import BeautifulSoup +import pandas as pd import time +from stealth_requests import StealthSession +import random +from multiprocessing import Pool +from tqdm import tqdm -uprns = [ - 100071297618, - 100080893397, - 100060778033, - 200004793081, - 100071265143, - 100071297618, - 100080893397, - 100060778033, - 200004793081, - 100071265143, -] +ENGINES = ["safari", "chrome"] +# Upper bound on re-fetches per URL, so a page that never yields an estimate cannot loop forever +MAX_RETRIES = 5 -estimate_list = [] -for uprn in uprns: + +def scrape_all_estimates(session, url): + """Fetch ``url`` and return the "sale-estimate" divs plus a flag that is True when none were found.""" + # Rotate impersonation per request + resp = session.get(url, impersonate=random.choice(ENGINES)) + page_source = BeautifulSoup(resp.text, "html.parser") + estimates = page_source.find_all("div", {"data-testid": "sale-estimate"}) + is_blocked = len(estimates) == 0 + return estimates, is_blocked - # Probably can change the timings here - time.sleep(5) - with SB(uc=True) as sb: - sb.uc_open_with_reconnect( - f"https://www.zoopla.co.uk/property/uprn/{uprn}/", - 3, + + +def parallel_task(url): + """Return the low/middle/high sale estimates scraped for ``url``; the values are None if no estimate was found.""" + # No impersonate argument here + with StealthSession() as session: + estimates, is_blocked = scrape_all_estimates(session, url) + + # Bounded retry: an empty result may mean "blocked" or simply "no estimate on this page" + attempts = 0 + while is_blocked and attempts < MAX_RETRIES: + attempts += 1 + print(f"Blocked by Zoopla for URL: {url}") + time.sleep(random.uniform(0, 1)) + estimates, is_blocked = scrape_all_estimates(session, url) + + if is_blocked: + return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None} + + low_estimate = estimates[0].find("span", {"data-testid": "low-estimate-blurred"}).text + middle_estimate = estimates[0].find("p", {"data-testid": "estimate-blurred"}).text + 
high_estimate = estimates[0].find("span", {"data-testid": "high-estimate-blurred"}).text + + return { + "URL": url, + "Low Estimate": low_estimate, + "Middle Estimate": middle_estimate, + "High Estimate": high_estimate, + } + + +def parse_price(p): + """Convert a price label such as ``£250k``, ``£1.2m`` or ``£250,000`` to a float; missing values become NaN.""" + if pd.isnull(p): + return float("nan") + p = str(p).replace("£", "").replace(",", "").strip().lower() + if p.endswith("k"): + return float(p[:-1]) * 1000 + elif p.endswith("m"): + return float(p[:-1]) * 1_000_000 + else: + return float(p) + + +if __name__ == "__main__": + # Get a SAL + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NRLA/Property Box/Property Box Finance Portfolio - " + "Standardised.xlsx", + sheet_name="Standardised Asset List" + ) + asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str) + uprns = asset_list["epc_os_uprn"].tolist() + urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns] + + with Pool(processes=5) 
as pool: + estimates_list = list( + tqdm( + pool.imap(parallel_task, urls), + total=len(urls), + ) ) - soup = sb.get_beautiful_soup() + df = pd.DataFrame(estimates_list) + # Extract UPRN from URL + df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/") + df["valuation"] = df["Middle Estimate"].apply(parse_price) + df.to_csv("zoopla_estimates.csv", index=False) - estimates = soup.find_all("div", {"data-testid": "sale-estimate"}) - # Can change the way we extract the text here - estimate_text = ( - estimates[-1].find_all("p")[-1].find_all("span")[-1]["aria-label"] - ) - estimate_list.append(estimate_text) + df["uprn"] = df["uprn"].astype(int).astype(str) + + asset_list.merge(df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left").to_excel( + "Property Box Finance Portfolio - Standardised - with valuations.xlsx", index=False + ) diff --git a/etl/webscrape/requirements.txt b/etl/webscrape/requirements.txt new file mode 100644 index 00000000..4027a224 --- /dev/null +++ b/etl/webscrape/requirements.txt @@ -0,0 +1,5 @@ +beautifulsoup4>=4.12.0 +pandas>=2.0.0 +stealth-requests>=1.0.7 +tqdm>=4.65.0 +openpyxl \ No newline at end of file