From ba0d5e147396693603e3b281e19cb8c9ebb5bf1f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 25 Jun 2025 20:43:40 +0100 Subject: [PATCH] Data pulled for cavity abs estimates --- asset_list/abs_estimates.py | 145 ++++++++++++++++++++++++ etl/find_my_epc/AssetListEpcData.py | 34 ++++-- etl/find_my_epc/RetrieveFindMyEpc.py | 13 ++- sfr/principal_pitch/0_prepare_sample.py | 2 +- 4 files changed, 183 insertions(+), 11 deletions(-) create mode 100644 asset_list/abs_estimates.py diff --git a/asset_list/abs_estimates.py b/asset_list/abs_estimates.py new file mode 100644 index 00000000..ee85973c --- /dev/null +++ b/asset_list/abs_estimates.py @@ -0,0 +1,145 @@ +""" +Simple script to take a standardised asset list and calculate the abs. We'll use this code to estimate +the ABS for properties, going forward +""" +import os +import pandas as pd +import numpy as np +from dotenv import load_dotenv +from etl.find_my_epc.AssetListEpcData import AssetListEpcData +from backend.Funding import Funding +from backend.app.utils import sap_to_epc + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/Thrive Programme - reconciled.xlsx", + sheet_name="Cavity properties - for review" +) + +abs_matrix = pd.read_csv( + "/Users/khalimconn-kowlessar/Downloads/ECO4 Full Project Scores Matrix.csv" +) +pps_matrix = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/ECO4 Partial Project Scores Matrix v5.xlsx", + header=1 +) +pps_matrix.columns = [c.strip() for c in pps_matrix.columns] + +# We need to estimate the number of points the work will produce and the finishing band. For this, we assume 7 for +# cavity and 15 for solar. We'll be more specific in the future, but for now, this is a good enough estimate. 
+cavity_route = asset_list[["domna_address_1", "domna_postcode", "epc_os_uprn"]].rename( + columns={"domna_address_1": "address", "domna_postcode": "postcode", "epc_os_uprn": "upr"} +) +cavity_route["address"] = cavity_route["address"].astype(str) + +asset_list_epc_client = AssetListEpcData( + asset_list=cavity_route, + epc_auth_token=EPC_AUTH_TOKEN +) + +asset_list_epc_client.get_data() +asset_list_epc_client.get_non_invasive_recommendations() + +cwi_sap_points = [] +for r in asset_list_epc_client.non_invasive_recommendations: + if not r.get("recommendations"): + continue + cwi_recommendations = [ + x for x in r["recommendations"] if "cavity_wall_insulation" in x["type"] + ] + if cwi_recommendations: + cwi_recommendations = cwi_recommendations[0] + else: + continue + + address = r["address"] + postcode = r["postcode"] + + cwi_sap_points.append( + { + "address": address, + "postcode": postcode, + "sap_points": cwi_recommendations["sap_points"] + } + ) + +cwi_sap_points = pd.DataFrame(cwi_sap_points) +# Store the sap points in the cavity route to csv +# cwi_sap_points.to_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/cwi_sap_points_livewest_sw.csv", +# index=False +# ) +# cwi_sap_points = pd.read_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/cwi_sap_points_livewest_sw.csv" +# ) +avg_cwi_points_by_postcode = cwi_sap_points.groupby(["postcode"]).agg({"sap_points": "mean"}).reset_index() +avg_cwi_points = cwi_sap_points["sap_points"].median() +asset_list = asset_list.merge( + cwi_sap_points, how="left", left_on=["domna_address_1", "domna_postcode"], right_on=["address", "postcode"] +).drop( + columns=["address", "postcode"] +) + +# Fill the sap points with the average cwi points +asset_list = asset_list.merge( + avg_cwi_points_by_postcode.rename(columns={"postcode": "domna_postcode"}), + how="left", on=["domna_postcode"], suffixes=("", "_avg") +) +asset_list["sap_points"] = 
asset_list["sap_points"].fillna(asset_list["sap_points_avg"]) +asset_list.drop(columns=["sap_points_avg"], inplace=True) + +asset_list["sap_points"] = asset_list["sap_points"].fillna(avg_cwi_points) +asset_list["post_works_sap"] = asset_list["epc_sap_score_on_register"] + asset_list["sap_points"] +asset_list["post_works_epc"] = asset_list["post_works_sap"].apply(lambda x: sap_to_epc(x)) +asset_list["starting_half_band"] = asset_list["epc_sap_score_on_register"].apply(lambda x: Funding.get_sap_band(x)) +asset_list["ending_half_band"] = asset_list["post_works_sap"].apply(lambda x: Funding.get_sap_band(x)) +asset_list["floor_area_band"] = asset_list["epc_total_floor_area"].apply(lambda x: Funding.get_floor_area_band(x)) + +asset_list["funding_scheme"] = np.where( + ( + (asset_list["post_works_epc"] == asset_list["epc_rating_on_register"]) + ), + "GBIS", + "ECO4" +) +asset_list = asset_list.merge( + abs_matrix, how="left", left_on=["starting_half_band", "ending_half_band", "floor_area_band"], + right_on=['Starting Band', 'Finishing Band', 'Floor Area Segment', ] +) +asset_list = asset_list.drop(columns=['Starting Band', 'Finishing Band', 'Floor Area Segment']) + +# Using CWI solid 1.7 -> 0.3 rates +cwi_pps_matrix = pps_matrix[ + pps_matrix["Measure_Type"].isin(["CWI_0.033"]) +] +# Merge on +asset_list = asset_list.merge( + cwi_pps_matrix[['Starting Band', 'Total Floor Area Band', 'Cost Savings']].rename( + columns={ + "Cost Savings": "partial_project_score", + "Starting Band": "starting_half_band", + "Total Floor Area Band": "floor_area_band" + } + ), + how="left", + on=["starting_half_band", "floor_area_band"], +) +asset_list["partial_project_score"] = np.where( + (asset_list["epc_sap_score_on_register"] > 69), + None, + asset_list["partial_project_score"] +) + +asset_list["funding_abs"] = np.where( + asset_list["funding_scheme"] == "GBIS", + asset_list["partial_project_score"], + asset_list["Cost Savings"] +) + +# Store this data +asset_list.to_csv( + 
"/Users/khalimconn-kowlessar/Documents/hestia/Instagroup Review/thrive_abs_estimates.csv", + index=False +) diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index 9845ee3b..2ff9a3e0 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -1,3 +1,4 @@ +import random import time import pandas as pd from tqdm import tqdm @@ -50,7 +51,7 @@ class AssetListEpcData: "uprn": r.get("uprn"), "address": r["address"], "postcode": r["postcode"], - "recommendations": r["recommendations"] + "recommendations": r.get("recommendations") } for r in self.extracted_data ] @@ -106,12 +107,31 @@ class AssetListEpcData: logger.error(f"Error retrieving find my epc data: {e}") if not pd.isnull(home.get("patch")): epc_searcher.newest_epc["address1"] = add1 - find_epc_searcher = RetrieveFindMyEpc( - address=epc_searcher.newest_epc["address1"], - postcode=epc_searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - time.sleep(0.5) + + try: + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except Exception as e: + logger.error(f"Error retrieving find my epc data with alternative address format: {e}") + find_epc_data = { + "current_epc_rating": epc_searcher.newest_epc["current-energy-rating"], + "current_epc_efficiency": epc_searcher.newest_epc["current-energy-efficiency"], + "potential_epc_rating": None, + "potential_epc_efficiency": None, + "epc_data": {} + } + + # Sleep for a random amount of time between 0.5 and 1 seconds to avoid hitting the API rate limit + time.sleep(random.sample(range(50, 100), 1)[0] / 100) + + # Every 50 requests, we sleep for 10 seconds to avoid hitting the API rate limit + if len(extracted_data) % 50 == 0 and len(extracted_data) > 0: + logger.info("Sleeping for 10 seconds to avoid 
hitting API rate limit") + time.sleep(10) + # We need uprn to_append = { diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index d4092fe7..3fd7918f 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -56,9 +56,11 @@ class RetrieveFindMyEpc: results = {} # 1. Total floor area - results['total-floor-area'] = int(self.get_text( + # We have some instances of very old EPCs where the total floor area is not available + tfa = self.get_text( soup.find("dt", string="Total floor area").find_next_sibling("dd") - ).split(" ")[0]) + ).split(" ")[0] + results['total-floor-area'] = int(tfa) if tfa != "Not" else None # Table with features rows = soup.select("table.govuk-table tbody tr") @@ -387,7 +389,9 @@ class RetrieveFindMyEpc: extracted_address = address_tag.text.strip() extracted_address_url = address_tag['href'] - extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower() + extracted_address_cleaned = ( + extracted_address.replace(",", "").replace(" ", "").lower() + ) if not extracted_address_cleaned.startswith(self.address_cleaned): continue @@ -667,6 +671,9 @@ class RetrieveFindMyEpc: "Condensing boiler (separate from the range cooker)": ["boiler_upgrade"], "Heating controls (programmer and thermostatic radiator valves)": [ "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + 'Heating controls (programmer room thermostat and thermostatic radiator valves)': [ + "roomstat_programmer_trvs", "time_temperature_zone_control" ] } diff --git a/sfr/principal_pitch/0_prepare_sample.py b/sfr/principal_pitch/0_prepare_sample.py index bcab16b9..8150d519 100644 --- a/sfr/principal_pitch/0_prepare_sample.py +++ b/sfr/principal_pitch/0_prepare_sample.py @@ -21,7 +21,7 @@ birmingham_epcs = birmingham_epcs.sort_values( birmingham_epcs["postal_region"] = birmingham_epcs["POSTCODE"].str.split(" ").str[0] addressable_market = birmingham_epcs[ - 
(birmingham_epcs['CURRENT_ENERGY_RATING'].isin(['F', 'G', 'E'])) & + (birmingham_epcs['CURRENT_ENERGY_RATING'].isin(['F', 'G', 'E', 'D'])) & (birmingham_epcs['LODGEMENT_DATE'] >= '2020-01-01') & (birmingham_epcs['PROPERTY_TYPE'].isin(['House', 'Bungalow'])) & (birmingham_epcs['TENURE'].isin(