From 95ff513f801327c8af3f72b33ef9a00ac39ec96d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 23 Jul 2024 13:13:26 +0100 Subject: [PATCH] stonewater revision wip --- etl/bill_savings/data_collection.py | 2 +- etl/customers/stonewater/shdf_3_clustering.py | 299 +++++++++++++++++- etl/sfr/epc_average_by_postcode.py | 77 +++++ 3 files changed, 366 insertions(+), 12 deletions(-) create mode 100644 etl/sfr/epc_average_by_postcode.py diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index 35498037..f2a1a5c6 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -133,7 +133,7 @@ def app(): energy_consumption_data = [] for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)): # Skip the first 50 - if i < 127: + if i < 245: continue data = pd.read_csv(directory / "certificates.csv", low_memory=False) diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index bdac5ec2..ba2fcc39 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -1429,17 +1429,17 @@ def compile_data_final(): older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs # Store in S3 # TODO - read in instead of running - save_pickle_to_s3( - data=epc_data_batch_2, - s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", - bucket_name="retrofit-data-dev" - ) - - save_pickle_to_s3( - data=older_epcs_batch_2, - s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl", - bucket_name="retrofit-data-dev" - ) + # save_pickle_to_s3( + # data=epc_data_batch_2, + # s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + # bucket_name="retrofit-data-dev" + # ) + # + # save_pickle_to_s3( + # data=older_epcs_batch_2, + # s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl", + # bucket_name="retrofit-data-dev" + # ) epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) complete_epcs = pd.concat([epc_data, epc_data_batch_2]) @@ -1799,6 +1799,10 @@ def compile_data_final(): 'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above' ] + additional_features = [ + + ] + # Define the preprocessing for numerical and categorical features numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist() categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist() @@ -1957,3 +1961,276 @@ def pull_ideal_postcodes(missing_uprn_with_udprn): result["result"] ) completed_id += 1 + + +def updated_version(): + """ + This version of the clustering factors in the updates recieved from Stonewater to simplify the archetyping process + using fewer variables and also factoring in their internal data sources + + This work began on the 23rd July 2024 + :return: + """ + + ######################################################################## + # Read in data + ######################################################################## + asset_list = read_asset_list() + asset_list = merge_uprn_to_asset_list(asset_list) + + # Read in the properties that have been included in Osmosis' wave 2.1 + osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1() + + asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids) + + # We also check the address & postcode + asset_list["In Osmosis Wave 2.1"] = np.where( + asset_list["address1"].isin(osmosis_wave_2_1["Name"]), + True, + asset_list["In Osmosis Wave 2.1"] + ) + + priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data() + + # Filter the asset list down to the priority postcodes + asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes) + + master_sheet = master_sheet[ + master_sheet["Address ID"].isin( + asset_list["external_address_id"].values + ) + ] + + master_sheet["days_since_lodgement"] = ( + datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True) + ).dt.days + + asset_list = asset_list.drop(columns=["Lodgement Date"]).merge( + master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]], + how="left", + left_on="external_address_id", + right_on="Address ID" + ) + + # Flag properties that were surveyed within the last 5 years + asset_list["epc_within_5_years"] = asset_list["days_since_lodgement"] < 5 * 365 + + # Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already + # a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when + # the EPC is done + asset_list["is_epc_c_or_above"] = ( + ((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) | + (asset_list["EPC Rating"] >= 80) + ) + + clustering_features = asset_list[ + asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] + ][ + [ + "internal_id", "customer_asset_id", "postcode", "house_number", "address1", "address2", "city_town", + "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date", "epc_within_5_years", + "EPC Rating" + ] + ] + + # Merge on the SAP data + clustering_features = clustering_features.merge( + master_sheet[ + ["Address ID", "SAP"] + ].rename(columns={"SAP": "parity_modelled_sap"}), + how="left", + left_on="external_address_id", + right_on="Address ID" + ) + + +def read_asset_list(): + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", + header=4 + ) + + udprn_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0 + )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"}) + udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str) + udprn_data["Address ID"] = udprn_data["Address ID"].astype(str) + + asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID") + asset_list = asset_list.rename(columns={"UDPRN": "udprn"}) + + asset_list = asset_list.rename( + columns={ + "Osm. ID": "internal_id", + "Org. ref.": "customer_asset_id", + "Postcode": "postcode", + "House no": "house_number", + "Name": "address1", + "Address line 2": "address2", + "City/Town": "city_town", + "County": "county", + "Address ID": "external_address_id", + "Owning body": "owner" + } + ) + + asset_list["full_address"] = np.where( + ~pd.isnull(asset_list["address2"]), + ( + asset_list["address1"] + ", " + + asset_list["address2"] + ", " + + asset_list["city_town"].str.title() + ", " + + asset_list["postcode"] + ), + asset_list["address1"] + ", " + + asset_list["city_town"].str.title() + ", " + + asset_list["postcode"] + ) + return asset_list + + +def merge_uprn_to_asset_list(asset_list): + # Read in the lookups + uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json" + ))) + uprn_lookup_1["match_type"] = "Exact" + + uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json" + ))) + uprn_lookup_2 = uprn_lookup_2.rename( + columns={ + "epc_address": "standardised_address", + "epc_postcode": "standardised_postcode" + } + ) + uprn_lookup_2["match_type"] = "EPC" + uprn_lookup_2["uprn"] = np.where( + uprn_lookup_2["internal_id"] == 1091, + 83143766, + uprn_lookup_2["uprn"] + ) + + uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json" + ))) + uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply( + concatenate_row, axis=1 + ) + uprn_lookup_3 = uprn_lookup_3[ + ["udprn", "uprn", "standardised_address", "postcode"] + ].rename(columns={"postcode": "standardised_postcode"}) + uprn_lookup_3["match_type"] = "Exact" + + uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False) + uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str) + uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str) + # prepare lookup 4 + uprn_lookup_4 = [] + for _, x in uprn_lookup_4_basis.iterrows(): + + property_type = None + built_form = None + if x["option"] == 1: + uprn = x["os_option_1_uprn"] + standardised_address = x["os_option_1_address"] + postcode = x["os_option_1_postcode"] + elif x["option"] == 2: + uprn = x["os_option_2_uprn"] + standardised_address = x["os_option_2_address"] + postcode = x["os_option_2_address"].split(", ")[-1] + else: + uprn = x["manual_uprn"] + standardised_address = x["manual_address"] + postcode = x["manual_postcode"] + + uprn_lookup_4.append( + { + "internal_id": x["internal_id"], + "external_address_id": x["external_address_id"], + "uprn": uprn, + "standardised_address": standardised_address, + "standardised_postcode": postcode, + "property_type": property_type, + "built_form": built_form + } + ) + uprn_lookup_4 = pd.DataFrame(uprn_lookup_4) + uprn_lookup_4["match_type"] = "Fuzzy" + + # concat + uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2]) + + assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list) + + # Final preps of lookups + uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str) + uprn_lookup_3 = uprn_lookup_3.merge( + asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn" + ) + uprn_lookup = pd.concat([ + uprn_lookup, + uprn_lookup_3, + uprn_lookup_4 + ]) + uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str) + + asset_list = asset_list.merge( + uprn_lookup.drop(columns=["udprn"]), + how="inner", + on=["internal_id", "external_address_id"] + ) + + return asset_list + + +def read_omosis_wave_2_1(): + osmosis_wave_2_1 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx", + header=4, + ) + # Remove double spaces from "Name" + osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace(" ", " ") + + osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"}) + osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"] + # We produce a cleaned list of asset ids from osmosis_wave_2_1 + osmosis_wave_2_1_asset_ids = [x for x in osmosis_wave_2_1["Asset ID"].values if not pd.isnull(x)] + # We have some ids that are in the form 'id1, id2' so we split them + osmosis_wave_2_1_asset_ids = [int(x.strip()) for id_str in osmosis_wave_2_1_asset_ids for x in id_str.split(",")] + + return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 + + +def read_stonewater_asset_data(): + master_sheet = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master " + "sheet.csv", + encoding='latin1' + ) + + master_sheet["Address ID"] = master_sheet["Address ID"].astype(str) + + previous_waves = master_sheet[ + (master_sheet["In Osmosis W2.1"] == "Yes") | + (master_sheet["In Wates Wave 2.1"] == "Yes") | + (master_sheet["In Liv Green Wave 2.1"] == "Yes") | + (master_sheet["In CCS Wave 2.1"] == "Yes") + ].copy() + + previous_waves_address_id = [str(x) for x in previous_waves["Address ID"].values if not pd.isnull(x)] + + # We also read the priority postcodes + priority_postcodes = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority " + "postcodes.csv", + header=17 + ) + + priority_postcodes = priority_postcodes["Postcode"].tolist() + + return priority_postcodes, previous_waves_address_id, master_sheet diff --git a/etl/sfr/epc_average_by_postcode.py b/etl/sfr/epc_average_by_postcode.py new file mode 100644 index 00000000..93683000 --- /dev/null +++ b/etl/sfr/epc_average_by_postcode.py @@ -0,0 +1,77 @@ +import os +from tqdm import tqdm +import pandas as pd +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from backend.app.utils import sap_to_epc + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def app(): + """ + This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score + :return: + """ + + source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx") + source_file["row_id"] = source_file.index + # Split out the town, which is the final portion of the string, separated by commas + source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None) + source_file["Address"] = source_file["Address"].apply( + lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None + ) + + unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates() + + # for each postcode, pull EPC data + collected_data = [] + no_data_found = [] + no_data_after_filters = [] + for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)): + address1 = config["Address"] if not pd.isnull(config["Address"]) else "" + searcher = SearchEpc( + postcode=config["Postcode"], + address1=address1, + auth_token=EPC_AUTH_TOKEN, + os_api_key="" + ) + while True: + params = { + "postcode": config["Postcode"], + "address": address1, + } + results = searcher.client.domestic.search(params=params, size=10000) + if not results: + # We strip back address1 + address1 = " ".join(address1.split(" ")[:-1]) + if not address1: + break + else: + break + + if not results: + no_data_found.append(config) + continue + + data = pd.DataFrame(results["rows"]) + + data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int) + # Take EPCs post 2023 + data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce") + data = data[data["lodgement-date"] >= "2023-01-01"] + # Take private nrentals + data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])] + + if data.empty: + no_data_after_filters.append(config) + continue + + agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index() + agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"}) + agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc) + agg.insert(0, "Postcode", config["Postcode"]) + agg.insert(0, "Address", address1) + + collected_data.append(agg)