Adding epc data

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-23 13:18:21 +01:00
parent 95ff513f80
commit 0df3394c6c

View file

@ -13,7 +13,7 @@ import numpy as np
import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
save_dataframe_to_s3_parquet, save_pickle_to_s3
save_dataframe_to_s3_parquet, save_pickle_to_s3, read_pickle_from_s3
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
@ -1976,7 +1976,7 @@ def updated_version():
# Read in data
########################################################################
asset_list = read_asset_list()
asset_list = merge_uprn_to_asset_list(asset_list)
asset_list, uprn_lookup_2 = merge_uprn_to_asset_list(asset_list)
# Read in the properties that have been included in Osmosis' wave 2.1
osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
@ -2043,6 +2043,8 @@ def updated_version():
right_on="Address ID"
)
# Pull in the EPC data
def read_asset_list():
asset_list = pd.read_excel(
@ -2185,7 +2187,7 @@ def merge_uprn_to_asset_list(asset_list):
on=["internal_id", "external_address_id"]
)
return asset_list
return asset_list, uprn_lookup_2
def read_omosis_wave_2_1():
@ -2234,3 +2236,31 @@ def read_stonewater_asset_data():
priority_postcodes = priority_postcodes["Postcode"].tolist()
return priority_postcodes, previous_waves_address_id, master_sheet
def read_epc_data(uprn_lookup_2):
    """
    Load both batches of Stonewater EPC records from S3 and stack them.

    The first batch is stored as JSON; it receives a one-off UPRN correction
    and is then restricted to the internal ids present in ``uprn_lookup_2``.
    The second batch is stored as a pickle and is appended exactly as read
    (it is NOT filtered against ``uprn_lookup_2``).

    :param uprn_lookup_2: table with an "internal_id" column; only first-batch
        EPC rows whose "internal_id" appears in it are kept.
    :return: the filtered first batch followed by the second batch, joined
        with ``pd.concat`` (original row indices are kept, not reset).
    """
    bucket = "retrofit-data-dev"

    # Batch 1: JSON payload -> DataFrame.
    raw_json = read_from_s3(
        s3_file_name="customers/Stonewater/clustering/epc_data.json",
        bucket_name=bucket
    )
    first_batch = pd.DataFrame(json.loads(raw_json))

    # Hard-coded override: the record with internal_id 1091 gets UPRN 83143766.
    # NOTE(review): the reason for this correction is not visible here - confirm.
    needs_uprn_fix = first_batch["internal_id"] == 1091
    first_batch["uprn"] = np.where(needs_uprn_fix, 83143766, first_batch["uprn"])

    # We drop some EPCs: anything whose internal_id is not in the UPRN lookup.
    in_lookup = first_batch["internal_id"].isin(uprn_lookup_2["internal_id"].values)
    first_batch = first_batch[in_lookup]

    # Batch 2: already a pickled object, used as-is.
    # NOTE(review): presumably a DataFrame with matching columns - verify.
    second_batch = read_pickle_from_s3(
        bucket_name=bucket,
        s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl"
    )

    return pd.concat([first_batch, second_batch])