Model/etl/customers/stonewater/potential_eco_properties.py
2025-02-12 10:14:14 +00:00

542 lines
22 KiB
Python

import os
import time
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from utils.s3 import read_from_s3, read_pickle_from_s3
import msoffcrypto
from io import BytesIO
# Load the backend's .env file so the variables it defines are visible to os.getenv below
load_dotenv(dotenv_path="backend/.env")
# Auth token handed to SearchEpc in get_data; os.getenv yields None when the variable is not set
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def get_data(asset_list):
    """
    Look up the newest EPC, and its recommendations, for every property in ``asset_list``.

    Args:
        asset_list: DataFrame with one row per property. The columns "Postcode", "Number",
            "Full Address" and "row_id" are read from each row.

    Returns:
        tuple: ``(epc_data, errors)``. ``epc_data`` is a list of dicts, one per property for which
        an EPC was found, holding the row's ``row_id``, every field of the newest EPC and a
        ``recommendations`` list. ``errors`` is the list of ``row_id`` values whose lookup raised.
        Properties for which no EPC is found appear in neither list.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            searcher = SearchEpc(
                address1=str(home["Number"]),
                postcode=home["Postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=home["Full Address"],
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None
            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                continue

            # Look for EPC recommendations. This is best-effort: a failed lookup yields an empty list.
            # Only Exception is caught so that Ctrl-C / SystemExit still stop the run.
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                property_recommendations = {"rows": []}

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **searcher.newest_epc,
                    "recommendations": property_recommendations["rows"]
                }
            )
        except Exception:
            # Record the failing row and pause before moving on to the next property
            errors.append(home["row_id"])
            time.sleep(5)
    return epc_data, errors
def app():
    """
    Build the list of Stonewater cavity properties to put forward for non-intrusive surveys.

    Steps, in order:

    1. Decrypt the password-protected ECO rolling master workbook and drop cancelled installs.
    2. Flag archetyped properties whose wall type is an as-built cavity, or whose archetype contains
       a property that the costed packages mark as needing CWI / extract and re-fill.
    3. Merge the Parity feature columns on, derive "Reason Included" and flag ECO3 installs,
       properties sharing a postcode with an ECO3 install, and properties sharing a postcode with a
       surveyed CWI finding.
    4. Build the ECO3 flags for the as-built cavity properties that only appear in the Parity download.
    5. Concatenate both lists, drop ECO3 installs, keep the postal regions (the part of the postcode
       before the space) that contain a suspected-CWI property and hold more than 100 rows, and write
       the result to CSV.

    All inputs are read from, and the output is written to, hard-coded local paths.

    Raises:
        ValueError: if a property installed under ECO3 is still present in the final list.
    """
    # Read in the password protected master
    # TODO: This file should be deleted!
    # Path to the password-protected Excel file
    file_path = ("/Users/khalimconn-kowlessar/Downloads/STONEWATER MASTER SHEET - UPDATED 20.5.24 - K- PASSWORD "
                 "PROTECTED.xlsx")
    # NOTE(review): the literal fallback keeps existing runs working, but a credential should not live in
    # source control - set STONEWATER_MASTER_PASSWORD in the environment and then remove the fallback.
    password = os.getenv("STONEWATER_MASTER_PASSWORD", "STONE123")

    # Open the file and decrypt it
    with open(file_path, "rb") as f:
        decrypted_file = BytesIO()
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        office_file.decrypt(decrypted_file)
    # Rewind so the reader starts at the beginning of the decrypted bytes
    decrypted_file.seek(0)

    # Read the decrypted file into a DataFrame
    eco_rolling_master = pd.read_excel(decrypted_file, sheet_name="Sheet1", engine="openpyxl")
    # na=False keeps rows whose cell is blank or not a string; without it a single such cell turns the
    # mask into an object column and the `~` below no longer produces a boolean filter
    eco_rolling_master = eco_rolling_master[
        ~eco_rolling_master['INSTALL/CANCELLATION DATE'].str.contains("CANCELLED", na=False)
    ]

    archetyped_properties = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 - "
        "Archetyped V3.1.xlsx",
        header=4
    )
    cavity_descriptions = [
        "Cavity: AsBuilt (1983-1995)",
        "Cavity: AsBuilt (Post 1995)",
        "Cavity: AsBuilt (Pre 1976)",
        "Cavity: AsBuilt (1976-1982)",
    ]
    archetyped_properties["Is Cavity Property"] = archetyped_properties["Wall Type"].isin(cavity_descriptions)

    # We also identify any properties where properties were found to need cavity wall insulation
    costed_packages = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages "
        "20241030 (WIP) Single Model V2.xlsx",
        sheet_name="Modelled Packages",
        header=13
    )
    needs_cwi = costed_packages[
        costed_packages["Main Wall Insulation"].isin(
            [
                "Poss Extract CWI & Refill (issues identified)",
                "CWI RdSAP Default"
            ]
        )
    ][["Address ID", "Address", "Current SAP Rating", "Current EPC Band", "Postcode", "Archetype ID",
       "Main Wall Insulation",
       "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]]

    # We flag these properties
    archetyped_properties["Survey shows CWI needed for Archetype"] = archetyped_properties["Archetype ID"].isin(
        needs_cwi["Archetype ID"]
    )
    # Drop rows without an Address ID and rows that merely repeat the header text
    archetyped_properties = archetyped_properties[~pd.isnull(archetyped_properties["Address ID"])]
    archetyped_properties = archetyped_properties[archetyped_properties["Address ID"] != "Address ID"]

    # this is the big list!!!
    features = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )
    features["Address ID"] = features["Address ID"].astype(str)
    features_to_merge = features[
        [
            "Address ID", "Organisation Reference", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating",
            "Main Fuel",
            "Hot Water",
            "Renewables", "Total Floor Area"
        ]
    ]
    stonewater_cavity_properties = archetyped_properties[
        ["Name", "Postcode", "Osm. ID", "Org. ref.", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no",
         "Street name",
         "Address line 2", "City/Town", "Is Cavity Property", "Survey shows CWI needed for Archetype"]
    ].merge(
        features_to_merge, how="left", on="Address ID"
    )

    # We filter this down to the properties that are cavity properties.
    # The explicit copy makes the column assignments below act on an independent frame.
    stonewater_cavity_properties = stonewater_cavity_properties[
        stonewater_cavity_properties["Is Cavity Property"] |
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"]
    ].copy()

    is_cavity = stonewater_cavity_properties["Is Cavity Property"]
    archetype_needs_cwi = stonewater_cavity_properties["Survey shows CWI needed for Archetype"]
    stonewater_cavity_properties["Reason Included"] = "As Built Cavity Property"
    stonewater_cavity_properties["Reason Included"] = np.where(
        archetype_needs_cwi & ~is_cavity,
        "Survey revealed potential need for CWI or extract and re-fill",
        stonewater_cavity_properties["Reason Included"]
    )
    stonewater_cavity_properties["Reason Included"] = np.where(
        archetype_needs_cwi & is_cavity,
        "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
        stonewater_cavity_properties["Reason Included"]
    )

    # We indicate the exact properties that need CWI, based on survey findings.
    # Applied in this order so that the extract and re-fill reason wins if an ID appears under both.
    surveyed_reasons = (
        ("CWI RdSAP Default", "Survey showed this property needs CWI"),
        (
            "Poss Extract CWI & Refill (issues identified)",
            "Survey showed this property could need extract and re-fill"
        ),
    )
    for wall_insulation, reason in surveyed_reasons:
        surveyed_ids = needs_cwi[needs_cwi["Main Wall Insulation"] == wall_insulation][
            "Address ID"].astype(int).astype(str).values
        stonewater_cavity_properties["Reason Included"] = np.where(
            stonewater_cavity_properties["Address ID"].isin(surveyed_ids),
            reason,
            stonewater_cavity_properties["Reason Included"]
        )

    # We flag units that were installed under ECO3
    numeric_ids = eco_rolling_master[eco_rolling_master["STONEWATER UPRN"] != "NOT ON ASSET LIST"]
    numeric_ids = numeric_ids[~pd.isnull(numeric_ids["STONEWATER UPRN"])].copy()
    numeric_ids["STONEWATER UPRN"] = numeric_ids["STONEWATER UPRN"].astype(int)
    stonewater_cavity_properties["Installed under ECO3"] = stonewater_cavity_properties["Org. ref."].isin(
        numeric_ids['STONEWATER UPRN'].values
    )

    # Which postcodes were installed under ECO3
    priority_list_eco3 = stonewater_cavity_properties[
        stonewater_cavity_properties["Installed under ECO3"]
    ]["Postcode"].unique()

    # These are properties that were not installed under ECO3, that have the same postcodes as properties
    # installed under ECO3
    # These are 66 properties we might want to start with as an immediate priority
    stonewater_cavity_properties["Same Postcode as Installed under ECO3"] = (
        ~stonewater_cavity_properties["Installed under ECO3"] & (
            stonewater_cavity_properties["Postcode"].isin(priority_list_eco3)
        )
    )
    stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)

    # Every "Reason Included" value that comes from a survey finding (i.e. everything except the default)
    survey_reasons = [
        "Survey revealed potential need for CWI or extract and re-fill",
        "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
        "Survey showed this property needs CWI",
        "Survey showed this property could need extract and re-fill"
    ]
    has_survey_reason = stonewater_cavity_properties["Reason Included"].isin(survey_reasons)

    # Find the postcodes where an Osmosis survey revealed a need for CWI
    postcodes_found_needing_cwi = stonewater_cavity_properties[has_survey_reason]["Postcode"].unique()
    stonewater_cavity_properties["Suspected Needs CWI - not surveyed"] = (
        stonewater_cavity_properties["Postcode"].isin(postcodes_found_needing_cwi) & ~has_survey_reason
    )

    # Merge the EPCs on, with the data we need
    stonewater_cavity_properties = stonewater_cavity_properties.rename(
        columns={
            "Age": "Parity - Build Age",
            "Property Type": "Parity - Property Type",
            "Walls": "Parity - Wall Construction",
            "Roofs": "Parity - Roof Construction",
            "Glazing": "Parity - Glazing Type",
            "Heating": "Parity - Heating Type",
            "Main Fuel": "Parity - Main Fuel",
            "Hot Water": "Parity - Hot Water",
            "Renewables": "Parity - Renewables",
            "Total Floor Area": "Parity - Total Floor Area"
        }
    )

    # We now flag the additional properties in the as built list
    additional_properties = features[
        ~features["Address ID"].isin(archetyped_properties["Address ID"].values)
    ]
    # Filter on as built cavity properties
    additional_properties = additional_properties[
        additional_properties["Walls"].isin(cavity_descriptions)
    ].copy()
    additional_properties["Full Address"] = additional_properties["Address"].copy()

    house_numbers = []
    for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
        first_address_part = x["Address"].split(",")[0]
        house_no = SearchEpc.get_house_number(first_address_part, x["Postcode"])
        if house_no is None:
            house_no = first_address_part
        # If we end up with a number like "01" we need to remove the leading zero
        house_no = house_no.lstrip("0")
        house_numbers.append(
            {
                "Address ID": x["Address ID"],
                "Number": house_no
            }
        )
    # Naming the columns keeps the merge key present even when there are no additional properties
    house_numbers = pd.DataFrame(house_numbers, columns=["Address ID", "Number"])
    additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
    additional_properties["row_id"] = additional_properties["Address ID"].copy()

    # Flag any units in this list that were installed under ECO3
    additional_properties["Installed under ECO3"] = additional_properties["Organisation Reference"].isin(
        numeric_ids['STONEWATER UPRN'].values
    )
    # Additional list ECO3
    additional_list_eco3 = additional_properties[additional_properties["Installed under ECO3"]]["Postcode"].unique()

    # These are properties that were not installed under ECO3, that have the same postcodes as properties
    # installed under ECO3
    # These are 297 properties we might want to start with as an immediate priority
    additional_properties["Same Postcode as Installed under ECO3"] = (
        ~additional_properties["Installed under ECO3"] & (
            additional_properties["Postcode"].isin(additional_list_eco3)
        )
    )

    # We do some additional manual checks, for ECO3 properties that were installed that didn't get matched to either
    # dataset
    numeric_ids["In asset list"] = numeric_ids["STONEWATER UPRN"].isin(
        stonewater_cavity_properties['Org. ref.'].astype(int).values
    )
    numeric_ids["In asset list"] = numeric_ids["In asset list"] | (
        numeric_ids["STONEWATER UPRN"].isin(
            additional_properties['Organisation Reference'].astype(int).values
        )
    )
    # eco3_installs_not_in_asset_list = numeric_ids[~numeric_ids["In asset list"]]
    # # We now take samples of properties randomly and manually check the ID against the asset list
    # print(eco3_installs_not_in_asset_list.sample(1)[["STONEWATER UPRN", "Post Code", "NO ", "Street / Block Name", ]])
    # # Checked STONEWATER UPRN
    # # 9862, BH15 1NR, 33, THE QUAY FOYER [x]
    # # 12785, S01 66PN, 57, SEACOLE GARDENS [x]
    # # 26071, MK42 0TE, 51, De Havilland Avenue, Shortstown [x]
    # # 18213, HR6 9UW, 20 Ford Street [x]
    # # 24344, LU4 9FF, 6 SEAL CLOSE [x]
    # # 31222, SN14 0QZ, 7 HARDBROOK COURT [x]
    # # 9343, SP4 7XL, 10 OAK PLACE [x]
    # # 34730, LU5 5TN, 4 TUDOR DRIVE [x]
    # # 7021, BN27 2BZ, 32 BUTTS FIELD []
    #
    # stonewater_cavity_properties[stonewater_cavity_properties['Org. ref.'] == 7021]
    # stonewater_cavity_properties[stonewater_cavity_properties['Postcode'] == "BN27 2BZ"]["Name"]
    #
    # additional_properties[additional_properties['Organisation Reference'] == 7021]
    # additional_properties[additional_properties['Postcode'] == "BN27 2BZ"][["Address"]]

    # Pull the EPCs for these properties
    # additional_properties_epcs, errors = get_data(additional_properties)
    # Save this data as a pickle
    # import pickle
    # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
    #           "wb") as f:
    #     pickle.dump(additional_properties_epcs, f)

    additional_properties["Suspected Needs CWI - not surveyed"] = (
        additional_properties["Postcode"].isin(postcodes_found_needing_cwi) &
        ~additional_properties["Installed under ECO3"]
    )

    # We drop Full Address
    additional_properties = additional_properties.drop(columns=["Full Address"])
    # NOTE(review): "Suspected Needs CWI - not surveyed" is computed just above but is not in this column
    # selection, so these rows carry no value for it after the concat below - confirm that is intended.
    additional_properties2 = additional_properties[[
        "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
        "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
        'Same Postcode as Installed under ECO3', "Organisation Reference",
    ]].rename(
        columns={
            "Organisation Reference": "Org. ref.",
            "SAP": "Parity - Predicted SAP",
            "SAP Band": "Parity - Predicted SAP Band",
            "Age": "Parity - Build Age",
            "Property Type": "Parity - Property Type",
            "Walls": "Parity - Wall Construction",
            "Roofs": "Parity - Roof Construction",
            "Glazing": "Parity - Glazing Type",
            "Heating": "Parity - Heating Type",
            "Main Fuel": "Parity - Main Fuel",
            "Hot Water": "Parity - Hot Water",
            "Renewables": "Parity - Renewables",
            "Total Floor Area": "Parity - Total Floor Area"
        }
    )

    # Combine the data:
    # NOTE(review): this joins on "Organisation Reference"; if that key is missing or repeated in the
    # Parity download the merge will multiply rows - verify against the data.
    stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
        features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
    )
    full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
    full_dataset = full_dataset.drop(columns=['Osm. ID'])

    # We now define the priority list for non-intrusives
    postcode_before_space = full_dataset["Postcode"].str.split(" ").str[0]
    full_dataset["Postal Region"] = postcode_before_space.str[0:2]
    full_dataset["Postal Region 2"] = postcode_before_space

    # Strip out anything we definitely don't want
    full_dataset = full_dataset[~full_dataset["Installed under ECO3"]]

    # Explicit comparison with True: rows that came from the additional list have no value in this column
    areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"].eq(True)]["Postal Region 2"].unique()
    priorities = full_dataset[
        full_dataset["Postal Region 2"].isin(areas)
    ]

    # Keep only the regions with more than 100 candidate rows. Filtering the value_counts Series
    # directly avoids depending on how a given pandas version names the columns after reset_index.
    region_prevalance = priorities["Postal Region 2"].value_counts()
    region_prevalance = region_prevalance[region_prevalance > 100]
    df = priorities[priorities["Postal Region 2"].isin(region_prevalance.index)]

    if df["Installed under ECO3"].sum():
        raise ValueError("There are properties in the priority list that were installed under ECO3")

    df.to_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
        "revised list.csv",
        index=False
    )
    # We save the data locally
    # stonewater_cavity_properties.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
    #     "postcodes.csv",
    #     index=False
    # )
    # additional_properties2.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
    #     "non-priority postcodes.csv",
    #     index=False
    # )
    # # Save the survey findings
    # needs_cwi.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI -
    #     WIP.csv",
    #     index=False
    # )
def cross_reference_epc_programme():
    """
    Match the "ECO3 not completed" address list against the Parity master sheet.

    A house number is derived for every row of both inputs. Each ECO3 address is then compared
    with every Parity "Address" using fuzzywuzzy's ``fuzz.ratio``, and rows scoring above 90 are
    kept.

    Changes from the previous version, which computed matches but discarded them and returned None:

    * the fallback house number (text before the first comma) is now actually stored - it used to be
      written to a temporary row copy and then overwritten by a second pass without the fallback;
    * the fuzzy match remains the matching criterion (it already overwrote the house-number match),
      and house-number agreement is reported as a column instead of being thrown away;
    * the matches are collected and returned.

    Returns:
        pandas.DataFrame: the matching Parity rows (one row per ECO3 address / Parity address pair)
        with two added columns: "ECO3 ADDRESS", the address it was matched to, and
        "House number matches", whether the derived house numbers are equal. Empty when nothing
        matches.
    """
    # Imported once here rather than on every loop iteration
    from fuzzywuzzy import fuzz

    eco3_fallout = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
    )

    def _eco3_house_number(address):
        # This list is looked up with an empty postcode; fall back to the text before the first comma
        house_no = SearchEpc.get_house_number(address, "")
        if house_no is None:
            house_no = address.split(",")[0]
        return house_no

    eco3_fallout["house_number"] = eco3_fallout["ADDRESS"].apply(_eco3_house_number)

    stonewater_modelled_above_c = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )
    stonewater_modelled_above_c["house_number"] = [
        SearchEpc.get_house_number(address, postcode)
        for address, postcode in zip(
            stonewater_modelled_above_c["Address"], stonewater_modelled_above_c["Postcode"]
        )
    ]

    eco3_fallout_matched_to_above_c = []
    for _, fallout_row in eco3_fallout.iterrows():
        eco3_address = fallout_row["ADDRESS"]
        # We do a fuzzy match on the address, with Levenshtein distance
        is_similar = stonewater_modelled_above_c["Address"].apply(
            lambda address: fuzz.ratio(address, eco3_address) > 90
        )
        match = stonewater_modelled_above_c[is_similar].copy()
        match["ECO3 ADDRESS"] = eco3_address
        match["House number matches"] = match["house_number"] == fallout_row["house_number"]
        eco3_fallout_matched_to_above_c.append(match)

    if not eco3_fallout_matched_to_above_c:
        # pd.concat refuses an empty list, so build the empty result with the same columns
        return pd.DataFrame(
            columns=[*stonewater_modelled_above_c.columns, "ECO3 ADDRESS", "House number matches"]
        )
    return pd.concat(eco3_fallout_matched_to_above_c, ignore_index=True)
def finalise_list_for_non_intrusives():
    """
    Produce the final non-intrusives workbook from the reviewed list.

    Reads the reviewed non-intrusives workbook, removes rows installed under ECO3 and rows whose
    Address ID appears in the Osmosis "Modelled Packages" sheet, attaches the Organisation
    Reference from the Parity download, and keeps only rows whose postcode is shared with a row
    marked "Include in non-intrusives". The result is written to an Excel file; nothing is returned.
    """
    reviewed = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater "
        "Non-Intrusives.xlsx"
    )
    # Anything installed under ECO3 is out of scope
    installed_under_eco3 = reviewed["Installed under ECO3"]
    candidates = reviewed[~installed_under_eco3]

    # Mark the properties that Osmosis has already surveyed
    osmosis_packages = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 "
        "(1).xlsx",
        header=13,
        sheet_name="Modelled Packages"
    )
    surveyed_ids = osmosis_packages["Address ID"].values
    candidates = candidates.assign(
        **{"Surveyed by Osmosis": candidates["Address ID"].isin(surveyed_ids)}
    )
    # Removed 54 addresses
    not_yet_surveyed = candidates[~candidates["Surveyed by Osmosis"]]

    parity_features = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )
    # Bring the organisation reference across via the Address ID
    reference_lookup = parity_features[["Organisation Reference", "Address ID"]]
    final_non_intrusives = not_yet_surveyed.merge(reference_lookup, how="left", on="Address ID")

    # First two characters of the part of the postcode that precedes the space
    postcode_prefix = final_non_intrusives["Postcode"].str.split(" ").str[0]
    final_non_intrusives["Postal Region"] = postcode_prefix.str[0:2]

    # NOTE(review): despite the "region" naming, the selection below works on full postcodes,
    # not on "Postal Region" - confirm this is the intended granularity.
    flagged_rows = final_non_intrusives[final_non_intrusives["Include in non-intrusives"]]
    selected_regions = flagged_rows["Postcode"].unique()
    final_non_intrusives["Is in region"] = final_non_intrusives["Postcode"].isin(selected_regions)

    # Filter down to the selected postcodes and save
    final_non_intrusives = final_non_intrusives[final_non_intrusives["Is in region"]]
    final_non_intrusives.to_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives "
        "List - final.xlsx")