diff --git a/.idea/Model.iml b/.idea/Model.iml index b0f9c00d..4413bb06 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 1122b380..6f308057 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py index d30205ae..44fa7142 100644 --- a/etl/customers/goldman/property_ownership.py +++ b/etl/customers/goldman/property_ownership.py @@ -3,6 +3,7 @@ import pandas as pd from tqdm import tqdm import Levenshtein from backend.SearchEpc import SearchEpc +from utils.s3 import read_dataframe_from_s3_parquet # Average value of a property in the midlands in 2024 was £238,000. Since these are EPC F & G properties, we assume # £207,000 since they trade at a discount. This is based on the rightmove study where moving from an EPC F/G -> C has a @@ -248,6 +249,13 @@ def app(): """ This script is for scoping property ownership for EPC F & G rated properties in Birmingam, for Goldman Sachs """ + + # TODO: This property: + # https://epc.opendatacommunities.org/domestic/search?address=&postcode=&local-authority=&constituency + # =&uprn=100031179243&from-month=1&from-year=2008&to-month=12&to-year=2024 + # is actually listed in two local authorities causing us to think it's an EPC F & G property, but it's + # it's actually EPC E. 
Need to handle this, probably by reading in all of the EPC data, concatenating together + # and performing a singular filter for most recent EPC by UPRN # paths = [ # "local_data/all-domestic-certificates/domestic-E08000025-Birmingham/certificates.csv", # "local_data/all-domestic-certificates/domestic-E08000031-Wolverhampton/certificates.csv", @@ -477,6 +485,35 @@ def app(): portfolio_epc_data_50m.to_excel("portfolio_epc_data_50m 28th May.xlsx", index=False) portfolio_epc_data_20m.to_excel("portfolio_epc_data_20m 28th May.xlsx", index=False) + # We check if any of these properties are in a conservation area + valuations = pd.read_excel("property value.xlsx") + + uprn_filenames = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet" + ) + + geospatial_data = [] + for _, row in tqdm(valuations.iterrows(), total=len(valuations)): + filtered_df = uprn_filenames[ + (uprn_filenames["lower"] <= row["UPRN"]) + & (uprn_filenames["upper"] >= row["UPRN"]) + ] + if filtered_df.empty: + raise Exception("No match found") + + filename = filtered_df.iloc[0]["filenames"] + + spatial_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}" + ) + spatial = spatial_data[ + spatial_data["UPRN"] == row["UPRN"] + ][["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]] + geospatial_data.append(spatial.to_dict("records")[0]) + + geospatial_data = pd.DataFrame(geospatial_data) + geospatial_data.to_excel("geospatial_data.xlsx", index=False) + def company_aggregation(): company_ownership = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_04.csv") diff --git a/etl/property_valuation/requirements.txt b/etl/property_valuation/requirements.txt new file mode 100644 index 00000000..8a4a1924 --- /dev/null +++ b/etl/property_valuation/requirements.txt @@ -0,0 +1,7 @@ +seleniumbase +beautifulsoup4 +requests +pandas +tqdm +openpyxl +undetected_chromedriver \ No 
"""Scrape Zoopla price estimates for every UPRN in the portfolio asset list."""
import requests
import random
import time
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from seleniumbase import Driver
from seleniumbase import page_actions

import undetected_chromedriver as webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

BASE_URL = "https://www.zoopla.co.uk/property/uprn/{uprn}/"

# CSS selectors of the three estimate elements, keyed by the output column they populate.
# The insertion order fixes the column order of the savepoint files.
_ESTIMATE_SELECTORS = {
    "price": "p[data-testid='estimate-blurred']",
    "lower_estimate": "span[data-testid='low-estimate-blurred']",
    "upper_estimate": "span[data-testid='high-estimate-blurred']",
}


def initialize_driver():
    """Create the headless, undetected-chromedriver backed SeleniumBase driver."""
    return Driver(headless=True, uc=True)


def _read_estimates(driver):
    """
    Read the price estimate and its lower/upper bounds from the page the driver has loaded.

    :param driver: driver that has already navigated to a property page
    :return: dict with the keys "price", "lower_estimate" and "upper_estimate"
    """
    # Only the headline estimate is waited for, the bounds are read straight after it appears
    page_actions.wait_for_element_visible(driver, _ESTIMATE_SELECTORS["price"], timeout=30)

    # Selenium elements expose their text through the `.text` attribute. `get_text(strip=True)` is
    # the BeautifulSoup API and raises AttributeError on a WebElement, which previously made every
    # property fall into the failure branch.
    return {
        column: driver.find_element("css selector", selector).text.strip()
        for column, selector in _ESTIMATE_SELECTORS.items()
    }


def app():
    """
    Scrape the estimate for every property in the asset list, writing a cumulative savepoint csv
    after each property so that progress survives a crash.

    A failure for a single property is printed and skipped, it does not stop the run.
    """
    # Read in the starting asset list
    asset_list = pd.read_excel("portfolio_epc_data_50m 28th May.xlsx")
    asset_list = asset_list[["UPRN", "ADDRESS", "POSTCODE"]]

    # asset_list.to_excel("property value.xlsx", index=False)

    uprns = asset_list["UPRN"].tolist()

    # A single driver is created. The previous extra webdriver.Chrome() instance was overwritten
    # straight away and never closed.
    driver = initialize_driver()
    try:
        driver.set_page_load_timeout(30)  # Increase page load timeout

        result = []
        for i, uprn in enumerate(tqdm(uprns)):

            # Every 10 requests sleep for an extra 7 seconds. This is keyed on the request index so
            # that a run of failures cannot trigger the pause on every single iteration.
            if i and i % 10 == 0:
                time.sleep(7)

            try:
                driver.get(BASE_URL.format(uprn=uprn))
                result.append({"UPRN": uprn, **_read_estimates(driver)})

                # Sleep a random amount of time between 5 and 20 seconds
                time.sleep(5 + (15 * random.random()))

            except Exception as e:
                # Deliberately best effort: report the property and carry on with the next one
                print(f"Failed to retrieve data for UPRN {uprn} at iteration {i}: {e}")

            # Store the result depending on where we are
            pd.DataFrame(result).to_csv(f"savepoint_index_{i}.csv", index=False)
    finally:
        # Always release the browser, even if the run is interrupted
        driver.quit()


if __name__ == "__main__":
    app()
def get_internal_external_wall_description(self, description_map, new_u_value):
    """
    Method of WallRecommendations: work out the wall description after insulation has been added.

    :param description_map: dict mapping the property's clean wall description to its ending
        description
    :param new_u_value: u value after insulation, only used when the current description is an
        "Average thermal transmittance" one
    :return: the ending wall description
    :raises ValueError: if the wall is described by its thermal transmittance and new_u_value is None
    :raises KeyError: if description_map has no entry for the property's clean wall description
    """
    clean_description = self.property.walls["clean_description"]

    if "Average thermal transmittance" in clean_description:
        if new_u_value is None:
            raise ValueError("New u value is None")
        # NOTE(review): the unit suffix is kept exactly as it was written, presumably to match the
        # encoding of the descriptions it is compared against - confirm before normalising it
        return f'Average thermal transmittance {new_u_value} W/m-¦K'

    try:
        return description_map[clean_description]
    except KeyError:
        # Same exception type as the plain lookup, but it now names the unsupported wall type
        raise KeyError(
            f"No insulated wall description is mapped for {clean_description!r}"
        ) from None


def set_starting_simulation_config(self, wall_ending_config):
    """
    Method of WallRecommendations: helper function to set the starting simulation config.

    :param wall_ending_config: dict of ending wall attributes, read for the keys
        "is_filled_cavity", "external_insulation" and "internal_insulation"
    :return: an empty dict, or a dict holding "walls_energy_eff_ending"
    """
    simulation_config = {}
    if self.property.data["walls-energy-eff"] not in ["Good", "Very Good"]:
        simulation_config["walls_energy_eff_ending"] = "Good"

    # We check if we have double insulation in any instances, i.e. at least two of the three
    # insulation measures are present on the ending wall
    insulation_measures = (
        wall_ending_config["is_filled_cavity"],
        wall_ending_config["external_insulation"],
        wall_ending_config["internal_insulation"],
    )
    if sum(bool(measure) for measure in insulation_measures) >= 2:
        simulation_config["walls_energy_eff_ending"] = "Very Good"

    return simulation_config
walls_simulation_config = check_simulation_difference( new_config=wall_ending_config, old_config=self.property.walls, prefix="walls_" ) + simulation_config = self.set_starting_simulation_config( + wall_ending_config=wall_ending_config + ) + simulation_config = { **walls_simulation_config, **simulation_config,