From 0df3394c6c6efb61845140e3868c7b60f7fbeb36 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 23 Jul 2024 13:18:21 +0100 Subject: [PATCH] Adding epc data --- etl/customers/stonewater/shdf_3_clustering.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index ba2fcc39..0704d64d 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -13,7 +13,7 @@ import numpy as np import pandas as pd import time from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \ - save_dataframe_to_s3_parquet, save_pickle_to_s3 + save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3 from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer @@ -1976,7 +1976,7 @@ def updated_version(): # Read in data ######################################################################## asset_list = read_asset_list() - asset_list = merge_uprn_to_asset_list(asset_list) + asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list) # Read in the properties that have been included in Osmosis' wave 2.1 osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1() @@ -2043,6 +2043,8 @@ def updated_version(): right_on="Address ID" ) + # Pull in the EPC data + def read_asset_list(): asset_list = pd.read_excel( @@ -2185,7 +2187,7 @@ def merge_uprn_to_asset_list(asset_list): on=["internal_id", "external_address_id"] ) - return asset_list + return asset_list, uprn_lookup_2 def read_omosis_wave_2_1(): @@ -2234,3 +2236,31 @@ def read_stonewater_asset_data(): priority_postcodes = priority_postcodes["Postcode"].tolist() return priority_postcodes, previous_waves_address_id, master_sheet + + +def read_epc_data(uprn_lookup_2): + epc_data = json.loads( + 
read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/epc_data.json" + ) + ) + epc_data = pd.DataFrame(epc_data) + + epc_data["uprn"] = np.where( + epc_data["internal_id"] == 1091, + 83143766, + epc_data["uprn"] + ) + + # We drop some EPCs + epc_data = epc_data[epc_data["internal_id"].isin(uprn_lookup_2["internal_id"].values)] + + epc_data_batch_2 = read_pickle_from_s3( + s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + bucket_name="retrofit-data-dev" + ) + + complete_epcs = pd.concat([epc_data, epc_data_batch_2]) + + return complete_epcs