stonewater revision wip

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-23 13:13:26 +01:00
parent 688ff0588c
commit 95ff513f80
3 changed files with 366 additions and 12 deletions

View file

@ -133,7 +133,7 @@ def app():
energy_consumption_data = []
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
# Skip the first 50
if i < 127:
if i < 245:
continue
data = pd.read_csv(directory / "certificates.csv", low_memory=False)

View file

@ -1429,17 +1429,17 @@ def compile_data_final():
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
# Store in S3
# TODO - read in instead of running
save_pickle_to_s3(
data=epc_data_batch_2,
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
save_pickle_to_s3(
data=older_epcs_batch_2,
s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
# save_pickle_to_s3(
# data=epc_data_batch_2,
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
# bucket_name="retrofit-data-dev"
# )
#
# save_pickle_to_s3(
# data=older_epcs_batch_2,
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
# bucket_name="retrofit-data-dev"
# )
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@ -1799,6 +1799,10 @@ def compile_data_final():
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
]
additional_features = [
]
# Define the preprocessing for numerical and categorical features
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
@ -1957,3 +1961,276 @@ def pull_ideal_postcodes(missing_uprn_with_udprn):
result["result"]
)
completed_id += 1
def updated_version():
"""
This version of the clustering factors in the updates recieved from Stonewater to simplify the archetyping process
using fewer variables and also factoring in their internal data sources
This work began on the 23rd July 2024
:return:
"""
########################################################################
# Read in data
########################################################################
asset_list = read_asset_list()
asset_list = merge_uprn_to_asset_list(asset_list)
# Read in the properties that have been included in Osmosis' wave 2.1
osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
# We also check the address & postcode
asset_list["In Osmosis Wave 2.1"] = np.where(
asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
True,
asset_list["In Osmosis Wave 2.1"]
)
priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
# Filter the asset list down to the priority postcodes
asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
master_sheet = master_sheet[
master_sheet["Address ID"].isin(
asset_list["external_address_id"].values
)
]
master_sheet["days_since_lodgement"] = (
datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
).dt.days
asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
how="left",
left_on="external_address_id",
right_on="Address ID"
)
# Flag properties that were surveyed within the last 5 years
asset_list["epc_within_5_years"] = asset_list["days_since_lodgement"] < 5 * 365
# Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
# a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
# the EPC is done
asset_list["is_epc_c_or_above"] = (
((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
(asset_list["EPC Rating"] >= 80)
)
clustering_features = asset_list[
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"]
][
[
"internal_id", "customer_asset_id", "postcode", "house_number", "address1", "address2", "city_town",
"county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date", "epc_within_5_years",
"EPC Rating"
]
]
# Merge on the SAP data
clustering_features = clustering_features.merge(
master_sheet[
["Address ID", "SAP"]
].rename(columns={"SAP": "parity_modelled_sap"}),
how="left",
left_on="external_address_id",
right_on="Address ID"
)
def read_asset_list():
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
header=4
)
udprn_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
asset_list = asset_list.rename(
columns={
"Osm. ID": "internal_id",
"Org. ref.": "customer_asset_id",
"Postcode": "postcode",
"House no": "house_number",
"Name": "address1",
"Address line 2": "address2",
"City/Town": "city_town",
"County": "county",
"Address ID": "external_address_id",
"Owning body": "owner"
}
)
asset_list["full_address"] = np.where(
~pd.isnull(asset_list["address2"]),
(
asset_list["address1"] + ", " +
asset_list["address2"] + ", " +
asset_list["city_town"].str.title() + ", " +
asset_list["postcode"]
),
asset_list["address1"] + ", " +
asset_list["city_town"].str.title() + ", " +
asset_list["postcode"]
)
return asset_list
def merge_uprn_to_asset_list(asset_list):
# Read in the lookups
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
)))
uprn_lookup_1["match_type"] = "Exact"
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
)))
uprn_lookup_2 = uprn_lookup_2.rename(
columns={
"epc_address": "standardised_address",
"epc_postcode": "standardised_postcode"
}
)
uprn_lookup_2["match_type"] = "EPC"
uprn_lookup_2["uprn"] = np.where(
uprn_lookup_2["internal_id"] == 1091,
83143766,
uprn_lookup_2["uprn"]
)
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
bucket_name="retrofit-data-dev",
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
)))
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
concatenate_row, axis=1
)
uprn_lookup_3 = uprn_lookup_3[
["udprn", "uprn", "standardised_address", "postcode"]
].rename(columns={"postcode": "standardised_postcode"})
uprn_lookup_3["match_type"] = "Exact"
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
# prepare lookup 4
uprn_lookup_4 = []
for _, x in uprn_lookup_4_basis.iterrows():
property_type = None
built_form = None
if x["option"] == 1:
uprn = x["os_option_1_uprn"]
standardised_address = x["os_option_1_address"]
postcode = x["os_option_1_postcode"]
elif x["option"] == 2:
uprn = x["os_option_2_uprn"]
standardised_address = x["os_option_2_address"]
postcode = x["os_option_2_address"].split(", ")[-1]
else:
uprn = x["manual_uprn"]
standardised_address = x["manual_address"]
postcode = x["manual_postcode"]
uprn_lookup_4.append(
{
"internal_id": x["internal_id"],
"external_address_id": x["external_address_id"],
"uprn": uprn,
"standardised_address": standardised_address,
"standardised_postcode": postcode,
"property_type": property_type,
"built_form": built_form
}
)
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
uprn_lookup_4["match_type"] = "Fuzzy"
# concat
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
# Final preps of lookups
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
uprn_lookup_3 = uprn_lookup_3.merge(
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
)
uprn_lookup = pd.concat([
uprn_lookup,
uprn_lookup_3,
uprn_lookup_4
])
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
asset_list = asset_list.merge(
uprn_lookup.drop(columns=["udprn"]),
how="inner",
on=["internal_id", "external_address_id"]
)
return asset_list
def read_omosis_wave_2_1():
osmosis_wave_2_1 = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
header=4,
)
# Remove double spaces from "Name"
osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace(" ", " ")
osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]
# We produce a cleaned list of asset ids from osmosis_wave_2_1
osmosis_wave_2_1_asset_ids = [x for x in osmosis_wave_2_1["Asset ID"].values if not pd.isnull(x)]
# We have some ids that are in the form 'id1, id2' so we split them
osmosis_wave_2_1_asset_ids = [int(x.strip()) for id_str in osmosis_wave_2_1_asset_ids for x in id_str.split(",")]
return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
def read_stonewater_asset_data():
master_sheet = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
"sheet.csv",
encoding='latin1'
)
master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)
previous_waves = master_sheet[
(master_sheet["In Osmosis W2.1"] == "Yes") |
(master_sheet["In Wates Wave 2.1"] == "Yes") |
(master_sheet["In Liv Green Wave 2.1"] == "Yes") |
(master_sheet["In CCS Wave 2.1"] == "Yes")
].copy()
previous_waves_address_id = [str(x) for x in previous_waves["Address ID"].values if not pd.isnull(x)]
# We also read the priority postcodes
priority_postcodes = pd.read_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
"postcodes.csv",
header=17
)
priority_postcodes = priority_postcodes["Postcode"].tolist()
return priority_postcodes, previous_waves_address_id, master_sheet

View file

@ -0,0 +1,77 @@
import os
from tqdm import tqdm
import pandas as pd
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from backend.app.utils import sap_to_epc
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def app():
"""
This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score
:return:
"""
source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
source_file["row_id"] = source_file.index
# Split out the town, which is the final portion of the string, separated by commas
source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
source_file["Address"] = source_file["Address"].apply(
lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
)
unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
# for each postcode, pull EPC data
collected_data = []
no_data_found = []
no_data_after_filters = []
for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
searcher = SearchEpc(
postcode=config["Postcode"],
address1=address1,
auth_token=EPC_AUTH_TOKEN,
os_api_key=""
)
while True:
params = {
"postcode": config["Postcode"],
"address": address1,
}
results = searcher.client.domestic.search(params=params, size=10000)
if not results:
# We strip back address1
address1 = " ".join(address1.split(" ")[:-1])
if not address1:
break
else:
break
if not results:
no_data_found.append(config)
continue
data = pd.DataFrame(results["rows"])
data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
# Take EPCs post 2023
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
data = data[data["lodgement-date"] >= "2023-01-01"]
# Take private nrentals
data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
if data.empty:
no_data_after_filters.append(config)
continue
agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
agg.insert(0, "Postcode", config["Postcode"])
agg.insert(0, "Address", address1)
collected_data.append(agg)