working on stonewater clustering pipeline

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-12 15:44:04 +01:00
parent 743422e8fe
commit 667ed1b990
3 changed files with 419 additions and 3 deletions

View file

@ -0,0 +1,156 @@
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from recommendations.recommendation_utils import (
estimate_perimeter,
estimate_external_wall_area,
estimate_number_of_floors
)
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def app(
    input_path: str = (
        "/Users/khalimconn-kowlessar/Downloads/Places for People NORTH WEST - EPC DATA PULL REQUEST.xlsx"
    ),
    output_path: str = "Places for People NORTH WEST - EPC DATA PULL.xlsx",
) -> None:
    """
    Pull the newest EPC for every property in the Places for People (North West) asset list
    and write the enriched asset list back out as an Excel workbook.

    For each row the EPC is looked up with ``SearchEpc`` (with the Ordnance Survey step skipped),
    a fixed set of EPC fields is merged onto the asset list, and a few derived columns are added
    (estimated number of floors, perimeter, external wall area and roof insulation thickness).

    :param input_path: Excel file holding the asset list. It must contain the "Address",
        "AddressLine1" and "Postcode" columns.
    :param output_path: Where the enriched Excel workbook is written.
    :return: None
    """
    asset_list = pd.read_excel(input_path, header=0)

    epc_data = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        full_address = home["Address"]
        address1 = home["AddressLine1"]
        postcode = home["Postcode"]

        searcher = SearchEpc(
            address1=address1,
            postcode=postcode,
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            property_type=None,
            fast=True,
            full_address=full_address
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None:
            continue

        epc = {
            "asset_list_address": full_address,
            **searcher.newest_epc.copy()
        }
        epc_data.append(epc)

    epc_df = pd.DataFrame(epc_data)

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "asset_list_address",
            "uprn",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description"
        ]
    ]

    # If the same address appears more than once in the asset list we will have collected one EPC
    # row per occurrence; keep a single row per address so the left merge below cannot multiply
    # asset rows.
    epc_df = epc_df.drop_duplicates(subset=["asset_list_address"])

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        left_on=["Address"],
        right_on=["asset_list_address"]
    )
    asset_list = asset_list.drop(columns=["asset_list_address"])

    # Rename the columns
    asset_list = asset_list.rename(columns={
        "inspection-date": "Date of last EPC",
        "current-energy-efficiency": "SAP score on register",
        "current-energy-rating": "EPC rating on register",
        "property-type": "EPC Property Type",
        "built-form": "EPC Archetype",
        "total-floor-area": "EPC Property Floor Area",
        "construction-age-band": "EPC Property Age Band",
        "floor-height": "EPC Property Floor Height",
        "number-habitable-rooms": "EPC Number of Habitable Rooms",
        "walls-description": "EPC Wall Construction",
        "roof-description": "EPC Roof Construction",
        "mainheat-description": "EPC Heating Type",
        "secondheat-description": "EPC Secondary Heating",
        "transaction-type": "Reason for last EPC"
    })

    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(
            property_type=x["EPC Property Type"]
        ) if not pd.isnull(x["EPC Property Type"]) else None,
        axis=1
    )

    # Missing EPC values can arrive as empty strings, so coerce anything non-numeric to NaN
    # instead of letting a plain float cast raise.
    for numeric_column in ("EPC Property Floor Area", "EPC Number of Habitable Rooms"):
        asset_list[numeric_column] = pd.to_numeric(
            asset_list[numeric_column], errors="coerce"
        ).astype(float)

    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["EPC Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["EPC Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ),
        axis=1
    )

    def _floor_height_or_default(raw_height, default=2.5):
        # NaN is truthy, so a plain truthiness test would let it through; treat null, empty and
        # zero heights alike and fall back to the default.
        if pd.isnull(raw_height) or not raw_height:
            return default
        return float(raw_height)

    # NOTE: the column name says "perimeter" but the value comes from estimate_external_wall_area;
    # the name is kept because it is part of the delivered spreadsheet.
    asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            floor_height=_floor_height_or_default(x["EPC Property Floor Height"]),
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["EPC Archetype"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["EPC Roof Construction"]).process()[
            "insulation_thickness"] if not pd.isnull(x["EPC Roof Construction"]) else None,
        axis=1
    )

    # Store as an excel
    asset_list.to_excel(output_path, index=False)

View file

@ -10,11 +10,47 @@ from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
# Manual overrides for properties where we do have an EPC, but the EPC did not give us a UPRN.
# Each pair is (internal_id, mapped_uprn); a mapped_uprn of None means no UPRN could be found.
_MISSING_UPRN_PAIRS = (
    # 1 Church Street, Alfreton, DE55 7AH
    # Doesn't seem to exist any more
    (78, None),
    # 1 Granville Road, Luton, LU1 1PA
    (315, 100080148856),
    # 11 College Street, Birstall, Batley, WF17 9HF
    # The EPC record is for 11 and 11a
    (1090, 83190440),
    # 11a College Street, Birstall, Batley, WF17 9HF
    (1092, 83143766),
    # Flat 5 Friars Street, Hereford, HR4 0AS
    # TODO: Check this
    # This UPRN is for 5 Friars Court, which is a flat
    (1384, 200002600892),
    # Flat 7 Friars Street, Hereford, HR4 0AS
    # TODO: Check this
    # This UPRN is for 7 Friars Court, which is a flat
    (1385, 200002600894),
    # 1 Waverley Street, Dudley, DY2 0YE
    (3349, 90022438),
    # 5 Brighton Road, Burgh Heath, Tadworth, KT20 6BQ
    # TODO: Check this
    # This UPRN is for 5 Copthorne, Brighton Road, Burgh Heath, KT20 6BQ, which is a flat
    (5027, 100062145273),
    # Room 1, 21 Coxford Road, Southampton, SO16 5FG
    # This is for 21 Coxford Road
    (5554, 100060692392),
)

# One row per override, with the columns "internal_id" and "mapped_uprn"
missing_uprn_map = pd.DataFrame(
    [
        {"internal_id": internal_id, "mapped_uprn": mapped_uprn}
        for internal_id, mapped_uprn in _MISSING_UPRN_PAIRS
    ]
)

# internal_ids whose EPC records are dropped from the EPC data before it is used
internal_id_epcs_to_drop = [315, 1384, 1385, 3349]
def remove_commas_and_full_stops(input_string: str) -> str:
"""
@ -610,7 +646,58 @@ def compile_data():
header_row=4
)
# TODO: Read in UPRNs
# TODO: Read in UPRNs or UDPRN
# --- compile_data(): load the previously pulled EPC and Ordnance Survey data from S3 ---

# Newest EPC per property, as stored by an earlier pull
epc_data = pd.DataFrame(
    json.loads(
        read_from_s3(
            bucket_name="retrofit-data-dev",
            s3_file_name="customers/Stonewater/clustering/epc_data.json"
        )
    )
)
# We drop some EPCs (the internal_ids listed in internal_id_epcs_to_drop)
epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]

# This we can use to produce additional variables such as number of old surveys
older_epc_data = json.loads(
    read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
    )
)
# JSON object keys are always strings once decoded, whereas internal_id_epcs_to_drop holds ints,
# so compare on the string form; comparing the raw values would never filter anything out.
dropped_internal_id_keys = {str(dropped_id) for dropped_id in internal_id_epcs_to_drop}
older_epc_data = {
    k: v for k, v in older_epc_data.items() if str(k) not in dropped_internal_id_keys
}

# This is the first ordnance survey data pull, which is stored in three numbered segments
os_most_relevant_1 = []
os_all_1 = {}
for i in tqdm(["1", "2", "3"]):
    most_relevant_segment = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
    )
    os_most_relevant_1.extend(json.loads(most_relevant_segment))

    os_all_segment = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
    )
    # Later segments overwrite earlier ones on a key clash
    os_all_1.update(json.loads(os_all_segment))

os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)

# This is the second ordnance survey data pull
os_most_relevant_2 = read_from_s3(
    bucket_name="retrofit-data-dev",
    s3_file_name="customers/Stonewater/clustering/problematic_os.json"
)
os_most_relevant_2 = json.loads(os_most_relevant_2)
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)

os_all_2 = read_from_s3(
    bucket_name="retrofit-data-dev",
    s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
)
os_all_2 = json.loads(os_all_2)
########################################################################
# Prepare asset list
@ -664,3 +751,176 @@ def compile_data():
# --- compile_data(): estimate EPCs for properties without one, then attach spatial data ---

if pd.isnull(asset_list["full_address"]).sum():
    raise ValueError("Missing full addresses")

# Quick check to see if we have os data for every property that doesn't have an EPC
without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]

os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()
# Sets give constant-time membership tests inside the loops below
os_1_internal_id_set = set(os_most_relevant_1_internal_ids)
os_2_internal_id_set = set(os_most_relevant_2_internal_ids)

# A property is only a problem if neither OS pull has data for it
missing_os_data = [
    candidate_id
    for candidate_id in without_epc["internal_id"]
    if candidate_id not in os_2_internal_id_set and candidate_id not in os_1_internal_id_set
]

if missing_os_data:
    raise Exception("We don't have SOME data for each internal_id")

# For the EPC data, some of them are missing UPRN
epc_data_to_address = asset_list[
    asset_list["internal_id"].isin(epc_data["internal_id"].values)
][["full_address", "internal_id"]].merge(
    epc_data, how="left", on="internal_id"
)

# EPC rows whose UPRN came back as an empty string
missed_uprn = epc_data_to_address[epc_data_to_address["uprn"] == ""]

# Once we have UPRNs, we might want to pull in the EPC data again
# epc_data_with_uprn = []
# older_epc_data_with_uprn = {}
#
# for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
#     searcher = SearchEpc(
#         address1=str(asset["address1"]),
#         postcode=str(asset["postcode"]),
#         auth_token=EPC_AUTH_TOKEN,
#         os_api_key="",
#         full_address=str(asset["full_address"]),
#         uprn=asset["uprn"]
#     )
#     searcher.find_property(skip_os=True)
#
#     if searcher.newest_epc is None:
#         continue
#
#     epc_data_with_uprn.append(
#         {
#             "internal_id": asset["internal_id"],
#             **searcher.newest_epc
#         }
#     )
#
#     if searcher.older_epcs is not None:
#         older_epc_data_with_uprn[asset["internal_id"]] = searcher.older_epcs

# We now get the remaining properties
# TODO: We might want to use epc_data_with_uprn
remaining_properties = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]

# We estimate the data
final_epcs = []
for _, p in remaining_properties.iterrows():
    internal_id = p["internal_id"]
    # NOTE(review): this reads the "UPRN" column while the comparison further down reads "uprn";
    # confirm which column the asset list actually carries.
    uprn = p["UPRN"]

    # NOTE(review): the first OS pull is preferred here, whereas the availability check was
    # written on the basis that the second pull takes priority - confirm the intended order.
    if internal_id in os_1_internal_id_set:
        p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id].to_dict("records")[0]
        p_os_full = os_all_1[str(internal_id)]
    else:
        p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id].to_dict("records")[0]
        p_os_full = os_all_2[str(internal_id)]

    # Each OS result is wrapped under either a "DPA" or an "LPI" key; flatten to one row per result
    p_os_full = pd.DataFrame(
        [x["DPA"] if "DPA" in x else x["LPI"] for x in p_os_full]
    )

    # TODO: Add this back in
    # When we have this
    if p["uprn"] != p_os_data["UPRN"]:
        # Get it from the older data
        filtered = p_os_full[p_os_full["UPRN"] == p["uprn"]]
        p_os_data = filtered.to_dict("records")[0]

    searcher = SearchEpc(
        address1=str(p["address1"]),
        postcode=str(p["postcode"]),
        auth_token=EPC_AUTH_TOKEN,
        os_api_key="",
        uprn=uprn
    )
    searcher.ordnance_survey_client.parse_classification_code(p_os_data["CLASSIFICATION_CODE"])
    searcher.find_property(skip_os=True)

    # NOTE(review): assumes find_property always leaves newest_epc populated on this path;
    # unpacking None here would raise a TypeError.
    final_epcs.append(
        {
            "internal_id": internal_id,
            **searcher.newest_epc
        }
    )

final_epcs = pd.DataFrame(final_epcs)

complete_epcs = pd.concat(
    [
        epc_data,
        final_epcs
    ]
)

# We now pull additional data
uprns = complete_epcs["uprn"].tolist()

# We get the spatial file list and loop through each EPC and determine which file it needs.
# We then just read in the files that we need and get the data, for each uprn from that file
uprn_filenames = read_dataframe_from_s3_parquet(
    bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
)

# Maps spatial filename -> list of UPRNs (as ints) that fall inside that file's [lower, upper] range
uprn_lookup = {}
for uprn in complete_epcs["uprn"]:
    # NaN is truthy, so it must be checked explicitly or int() below would raise on it
    if pd.isnull(uprn) or not uprn:
        # TODO: Do something about this!
        continue

    uprn_as_int = int(uprn)
    filtered_df = uprn_filenames[
        (uprn_filenames["lower"] <= uprn_as_int)
        & (uprn_filenames["upper"] >= uprn_as_int)
    ]
    spatial_filename = filtered_df["filenames"].values[0]
    uprn_lookup.setdefault(spatial_filename, []).append(uprn_as_int)

spatial_data_to_uprn = []
for filename, associated_uprn in tqdm(uprn_lookup.items(), total=len(uprn_lookup)):
    # Read in the file
    spatial_data = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
    )
    spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
    spatial_data_to_uprn.append(spatial_df)

spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)

# Store this in s3, alongside the other clustering inputs under customers/Stonewater/clustering/
save_data_to_s3(
    data=json.dumps(spatial_data_to_uprn.to_dict("records")),
    s3_file_name="customers/Stonewater/clustering/spatial_data_to_uprn.json",
    bucket_name="retrofit-data-dev"
)

# We merge this spatial data onto final EPCS
spatial_data_to_uprn = spatial_data_to_uprn.drop(
    columns=["partition", "filename"]
).rename(columns={"UPRN": "uprn"})
# NOTE(review): the merge key is cast to str here; this presumably matches the dtype of
# complete_epcs["uprn"] - confirm, otherwise the left merge will not match any rows.
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)

property_attributes = complete_epcs.merge(
    spatial_data_to_uprn,
    how="left",
    on="uprn"
)

# We drop the columns we don't care about for clustering
property_attributes = property_attributes.drop(
    columns=[
    ]
)

View file

@ -45,7 +45,7 @@ class RoofAttributes(Definitions):
"""
self.description: str = description.lower().strip()
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or self.description == "sap05:roof"
self.welsh_translation_search()