mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
stonewater revision wip
This commit is contained in:
parent
688ff0588c
commit
95ff513f80
3 changed files with 366 additions and 12 deletions
|
|
@ -133,7 +133,7 @@ def app():
|
|||
energy_consumption_data = []
|
||||
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
|
||||
# Skip the first 50
|
||||
if i < 127:
|
||||
if i < 245:
|
||||
continue
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
|
|
|
|||
|
|
@ -1429,17 +1429,17 @@ def compile_data_final():
|
|||
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
|
||||
# Store in S3
|
||||
# TODO - read in instead of running
|
||||
save_pickle_to_s3(
|
||||
data=epc_data_batch_2,
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_pickle_to_s3(
|
||||
data=older_epcs_batch_2,
|
||||
s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
# save_pickle_to_s3(
|
||||
# data=epc_data_batch_2,
|
||||
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
#
|
||||
# save_pickle_to_s3(
|
||||
# data=older_epcs_batch_2,
|
||||
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
|
||||
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
|
||||
|
|
@ -1799,6 +1799,10 @@ def compile_data_final():
|
|||
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
|
||||
]
|
||||
|
||||
additional_features = [
|
||||
|
||||
]
|
||||
|
||||
# Define the preprocessing for numerical and categorical features
|
||||
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
|
|
@ -1957,3 +1961,276 @@ def pull_ideal_postcodes(missing_uprn_with_udprn):
|
|||
result["result"]
|
||||
)
|
||||
completed_id += 1
|
||||
|
||||
|
||||
def updated_version():
|
||||
"""
|
||||
This version of the clustering factors in the updates recieved from Stonewater to simplify the archetyping process
|
||||
using fewer variables and also factoring in their internal data sources
|
||||
|
||||
This work began on the 23rd July 2024
|
||||
:return:
|
||||
"""
|
||||
|
||||
########################################################################
|
||||
# Read in data
|
||||
########################################################################
|
||||
asset_list = read_asset_list()
|
||||
asset_list = merge_uprn_to_asset_list(asset_list)
|
||||
|
||||
# Read in the properties that have been included in Osmosis' wave 2.1
|
||||
osmosis_wave_2_1_asset_ids, osmosis_wave_2_1 = read_omosis_wave_2_1()
|
||||
|
||||
asset_list["In Osmosis Wave 2.1"] = asset_list["customer_asset_id"].isin(osmosis_wave_2_1_asset_ids)
|
||||
|
||||
# We also check the address & postcode
|
||||
asset_list["In Osmosis Wave 2.1"] = np.where(
|
||||
asset_list["address1"].isin(osmosis_wave_2_1["Name"]),
|
||||
True,
|
||||
asset_list["In Osmosis Wave 2.1"]
|
||||
)
|
||||
|
||||
priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data()
|
||||
|
||||
# Filter the asset list down to the priority postcodes
|
||||
asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes)
|
||||
|
||||
master_sheet = master_sheet[
|
||||
master_sheet["Address ID"].isin(
|
||||
asset_list["external_address_id"].values
|
||||
)
|
||||
]
|
||||
|
||||
master_sheet["days_since_lodgement"] = (
|
||||
datetime.now() - pd.to_datetime(master_sheet["Lodgement Date"], errors="coerce", dayfirst=True)
|
||||
).dt.days
|
||||
|
||||
asset_list = asset_list.drop(columns=["Lodgement Date"]).merge(
|
||||
master_sheet[["Address ID", "days_since_lodgement", "Lodgement Date", "EPC Rating"]],
|
||||
how="left",
|
||||
left_on="external_address_id",
|
||||
right_on="Address ID"
|
||||
)
|
||||
|
||||
# Flag properties that were surveyed within the last 5 years
|
||||
asset_list["epc_within_5_years"] = asset_list["days_since_lodgement"] < 5 * 365
|
||||
|
||||
# Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already
|
||||
# a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when
|
||||
# the EPC is done
|
||||
asset_list["is_epc_c_or_above"] = (
|
||||
((asset_list["EPC Rating"] >= 69) & asset_list["epc_within_5_years"]) |
|
||||
(asset_list["EPC Rating"] >= 80)
|
||||
)
|
||||
|
||||
clustering_features = asset_list[
|
||||
asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"]
|
||||
][
|
||||
[
|
||||
"internal_id", "customer_asset_id", "postcode", "house_number", "address1", "address2", "city_town",
|
||||
"county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date", "epc_within_5_years",
|
||||
"EPC Rating"
|
||||
]
|
||||
]
|
||||
|
||||
# Merge on the SAP data
|
||||
clustering_features = clustering_features.merge(
|
||||
master_sheet[
|
||||
["Address ID", "SAP"]
|
||||
].rename(columns={"SAP": "parity_modelled_sap"}),
|
||||
how="left",
|
||||
left_on="external_address_id",
|
||||
right_on="Address ID"
|
||||
)
|
||||
|
||||
|
||||
def read_asset_list():
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/UDPRN updated RA Sample for 5 year programme.xlsx", header=0
|
||||
)[["AddressId", "UDPRN"]].rename(columns={"AddressId": "Address ID"})
|
||||
udprn_data["UDPRN"] = udprn_data["UDPRN"].astype("Int64").astype(str)
|
||||
udprn_data["Address ID"] = udprn_data["Address ID"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(udprn_data, how="inner", on="Address ID")
|
||||
asset_list = asset_list.rename(columns={"UDPRN": "udprn"})
|
||||
|
||||
asset_list = asset_list.rename(
|
||||
columns={
|
||||
"Osm. ID": "internal_id",
|
||||
"Org. ref.": "customer_asset_id",
|
||||
"Postcode": "postcode",
|
||||
"House no": "house_number",
|
||||
"Name": "address1",
|
||||
"Address line 2": "address2",
|
||||
"City/Town": "city_town",
|
||||
"County": "county",
|
||||
"Address ID": "external_address_id",
|
||||
"Owning body": "owner"
|
||||
}
|
||||
)
|
||||
|
||||
asset_list["full_address"] = np.where(
|
||||
~pd.isnull(asset_list["address2"]),
|
||||
(
|
||||
asset_list["address1"] + ", " +
|
||||
asset_list["address2"] + ", " +
|
||||
asset_list["city_town"].str.title() + ", " +
|
||||
asset_list["postcode"]
|
||||
),
|
||||
asset_list["address1"] + ", " +
|
||||
asset_list["city_town"].str.title() + ", " +
|
||||
asset_list["postcode"]
|
||||
)
|
||||
return asset_list
|
||||
|
||||
|
||||
def merge_uprn_to_asset_list(asset_list):
|
||||
# Read in the lookups
|
||||
uprn_lookup_1 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup.json"
|
||||
)))
|
||||
uprn_lookup_1["match_type"] = "Exact"
|
||||
|
||||
uprn_lookup_2 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="scustomers/Stonewater/clustering/address_uprn_udprn_lookup_2.json"
|
||||
)))
|
||||
uprn_lookup_2 = uprn_lookup_2.rename(
|
||||
columns={
|
||||
"epc_address": "standardised_address",
|
||||
"epc_postcode": "standardised_postcode"
|
||||
}
|
||||
)
|
||||
uprn_lookup_2["match_type"] = "EPC"
|
||||
uprn_lookup_2["uprn"] = np.where(
|
||||
uprn_lookup_2["internal_id"] == 1091,
|
||||
83143766,
|
||||
uprn_lookup_2["uprn"]
|
||||
)
|
||||
|
||||
uprn_lookup_3 = pd.DataFrame(json.loads(read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/ideal-postcodes_pull_2.json"
|
||||
)))
|
||||
uprn_lookup_3["standardised_address"] = uprn_lookup_3[["line_1", "line_2", "line_3", "district", "postcode"]].apply(
|
||||
concatenate_row, axis=1
|
||||
)
|
||||
uprn_lookup_3 = uprn_lookup_3[
|
||||
["udprn", "uprn", "standardised_address", "postcode"]
|
||||
].rename(columns={"postcode": "standardised_postcode"})
|
||||
uprn_lookup_3["match_type"] = "Exact"
|
||||
|
||||
uprn_lookup_4_basis = pd.read_csv("manual_fix_uprns-populated.csv", index_col=False)
|
||||
uprn_lookup_4_basis["os_option_1_uprn"] = uprn_lookup_4_basis["os_option_1_uprn"].astype(str)
|
||||
uprn_lookup_4_basis["os_option_2_uprn"] = uprn_lookup_4_basis["os_option_2_uprn"].astype("Int64").astype(str)
|
||||
# prepare lookup 4
|
||||
uprn_lookup_4 = []
|
||||
for _, x in uprn_lookup_4_basis.iterrows():
|
||||
|
||||
property_type = None
|
||||
built_form = None
|
||||
if x["option"] == 1:
|
||||
uprn = x["os_option_1_uprn"]
|
||||
standardised_address = x["os_option_1_address"]
|
||||
postcode = x["os_option_1_postcode"]
|
||||
elif x["option"] == 2:
|
||||
uprn = x["os_option_2_uprn"]
|
||||
standardised_address = x["os_option_2_address"]
|
||||
postcode = x["os_option_2_address"].split(", ")[-1]
|
||||
else:
|
||||
uprn = x["manual_uprn"]
|
||||
standardised_address = x["manual_address"]
|
||||
postcode = x["manual_postcode"]
|
||||
|
||||
uprn_lookup_4.append(
|
||||
{
|
||||
"internal_id": x["internal_id"],
|
||||
"external_address_id": x["external_address_id"],
|
||||
"uprn": uprn,
|
||||
"standardised_address": standardised_address,
|
||||
"standardised_postcode": postcode,
|
||||
"property_type": property_type,
|
||||
"built_form": built_form
|
||||
}
|
||||
)
|
||||
uprn_lookup_4 = pd.DataFrame(uprn_lookup_4)
|
||||
uprn_lookup_4["match_type"] = "Fuzzy"
|
||||
|
||||
# concat
|
||||
uprn_lookup = pd.concat([uprn_lookup_1, uprn_lookup_2])
|
||||
|
||||
assert len(uprn_lookup) + len(uprn_lookup_3) + len(uprn_lookup_4) == len(asset_list)
|
||||
|
||||
# Final preps of lookups
|
||||
uprn_lookup_3["udprn"] = uprn_lookup_3["udprn"].astype(str)
|
||||
uprn_lookup_3 = uprn_lookup_3.merge(
|
||||
asset_list[["udprn", "internal_id", "external_address_id"]], how="left", on="udprn"
|
||||
)
|
||||
uprn_lookup = pd.concat([
|
||||
uprn_lookup,
|
||||
uprn_lookup_3,
|
||||
uprn_lookup_4
|
||||
])
|
||||
uprn_lookup["external_address_id"] = uprn_lookup["external_address_id"].astype(str)
|
||||
|
||||
asset_list = asset_list.merge(
|
||||
uprn_lookup.drop(columns=["udprn"]),
|
||||
how="inner",
|
||||
on=["internal_id", "external_address_id"]
|
||||
)
|
||||
|
||||
return asset_list
|
||||
|
||||
|
||||
def read_omosis_wave_2_1():
|
||||
osmosis_wave_2_1 = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater Osmosis SHDF 2.1.xlsx",
|
||||
header=4,
|
||||
)
|
||||
# Remove double spaces from "Name"
|
||||
osmosis_wave_2_1["Name"] = osmosis_wave_2_1["Name"].str.replace(" ", " ")
|
||||
|
||||
osmosis_wave_2_1 = osmosis_wave_2_1.rename(columns={"Unnamed: 1": "Location"})
|
||||
osmosis_wave_2_1 = osmosis_wave_2_1[osmosis_wave_2_1["Location"] != "Removed from program"]
|
||||
# We produce a cleaned list of asset ids from osmosis_wave_2_1
|
||||
osmosis_wave_2_1_asset_ids = [x for x in osmosis_wave_2_1["Asset ID"].values if not pd.isnull(x)]
|
||||
# We have some ids that are in the form 'id1, id2' so we split them
|
||||
osmosis_wave_2_1_asset_ids = [int(x.strip()) for id_str in osmosis_wave_2_1_asset_ids for x in id_str.split(",")]
|
||||
|
||||
return osmosis_wave_2_1_asset_ids, osmosis_wave_2_1
|
||||
|
||||
|
||||
def read_stonewater_asset_data():
|
||||
master_sheet = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - master "
|
||||
"sheet.csv",
|
||||
encoding='latin1'
|
||||
)
|
||||
|
||||
master_sheet["Address ID"] = master_sheet["Address ID"].astype(str)
|
||||
|
||||
previous_waves = master_sheet[
|
||||
(master_sheet["In Osmosis W2.1"] == "Yes") |
|
||||
(master_sheet["In Wates Wave 2.1"] == "Yes") |
|
||||
(master_sheet["In Liv Green Wave 2.1"] == "Yes") |
|
||||
(master_sheet["In CCS Wave 2.1"] == "Yes")
|
||||
].copy()
|
||||
|
||||
previous_waves_address_id = [str(x) for x in previous_waves["Address ID"].values if not pd.isnull(x)]
|
||||
|
||||
# We also read the priority postcodes
|
||||
priority_postcodes = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Osmosis Reviewed - Parity Download 18.7 - priority "
|
||||
"postcodes.csv",
|
||||
header=17
|
||||
)
|
||||
|
||||
priority_postcodes = priority_postcodes["Postcode"].tolist()
|
||||
|
||||
return priority_postcodes, previous_waves_address_id, master_sheet
|
||||
|
|
|
|||
77
etl/sfr/epc_average_by_postcode.py
Normal file
77
etl/sfr/epc_average_by_postcode.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from backend.app.utils import sap_to_epc
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
||||
|
||||
def app():
|
||||
"""
|
||||
This script will retrieve EPC data, for postcodes and produce statistics on the SAP Score
|
||||
:return:
|
||||
"""
|
||||
|
||||
source_file = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Addresses - SFR rents.xlsx")
|
||||
source_file["row_id"] = source_file.index
|
||||
# Split out the town, which is the final portion of the string, separated by commas
|
||||
source_file["Town"] = source_file["Address"].apply(lambda x: x.split(" ")[-1].strip() if not pd.isnull(x) else None)
|
||||
source_file["Address"] = source_file["Address"].apply(
|
||||
lambda x: " ".join(x.split(" ")[:-1]).strip() if not pd.isnull(x) else None
|
||||
)
|
||||
|
||||
unique_postcodes = source_file[["Address", "Postcode"]].drop_duplicates()
|
||||
|
||||
# for each postcode, pull EPC data
|
||||
collected_data = []
|
||||
no_data_found = []
|
||||
no_data_after_filters = []
|
||||
for _, config in tqdm(unique_postcodes.iterrows(), total=len(unique_postcodes)):
|
||||
address1 = config["Address"] if not pd.isnull(config["Address"]) else ""
|
||||
searcher = SearchEpc(
|
||||
postcode=config["Postcode"],
|
||||
address1=address1,
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key=""
|
||||
)
|
||||
while True:
|
||||
params = {
|
||||
"postcode": config["Postcode"],
|
||||
"address": address1,
|
||||
}
|
||||
results = searcher.client.domestic.search(params=params, size=10000)
|
||||
if not results:
|
||||
# We strip back address1
|
||||
address1 = " ".join(address1.split(" ")[:-1])
|
||||
if not address1:
|
||||
break
|
||||
else:
|
||||
break
|
||||
|
||||
if not results:
|
||||
no_data_found.append(config)
|
||||
continue
|
||||
|
||||
data = pd.DataFrame(results["rows"])
|
||||
|
||||
data["current-energy-efficiency"] = data["current-energy-efficiency"].astype(int)
|
||||
# Take EPCs post 2023
|
||||
data["lodgement-date"] = pd.to_datetime(data["lodgement-date"], errors="coerce")
|
||||
data = data[data["lodgement-date"] >= "2023-01-01"]
|
||||
# Take private nrentals
|
||||
data = data[data["tenure"].isin(["rental (private)", "Rented (private)"])]
|
||||
|
||||
if data.empty:
|
||||
no_data_after_filters.append(config)
|
||||
continue
|
||||
|
||||
agg = data.groupby(["property-type", "built-form"])["current-energy-efficiency"].mean().reset_index()
|
||||
agg = agg.rename(columns={"current-energy-efficiency": "Average SAP"})
|
||||
agg["Average EPC"] = agg["Average SAP"].apply(sap_to_epc)
|
||||
agg.insert(0, "Postcode", config["Postcode"])
|
||||
agg.insert(0, "Address", address1)
|
||||
|
||||
collected_data.append(agg)
|
||||
Loading…
Add table
Reference in a new issue