mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on stonewater clustering pipeline
This commit is contained in:
parent
743422e8fe
commit
667ed1b990
3 changed files with 419 additions and 3 deletions
156
etl/customers/places_for_people/EPC data pull - 12th June.py
Normal file
156
etl/customers/places_for_people/EPC data pull - 12th June.py
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
import os
|
||||
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
|
||||
from recommendations.recommendation_utils import (
|
||||
estimate_perimeter,
|
||||
estimate_external_wall_area,
|
||||
estimate_number_of_floors
|
||||
)
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
||||
|
||||
def app(
    asset_list_path="/Users/khalimconn-kowlessar/Downloads/Places for People NORTH WEST - EPC DATA PULL REQUEST.xlsx",
    output_filename="Places for People NORTH WEST - EPC DATA PULL.xlsx",
):
    """
    Pull the newest EPC for every property in an asset list and export an enriched workbook.

    Every row of the asset list is searched with ``SearchEpc`` (the Ordnance Survey step is skipped).
    Rows for which no EPC is found are kept in the output with empty EPC columns. A handful of
    derived columns (number of floors, perimeter, external wall estimate, roof insulation thickness)
    are then added and the result is written to Excel.

    :param asset_list_path: Excel workbook to read. It must contain the columns "Address",
        "AddressLine1" and "Postcode".
    :param output_filename: path of the Excel workbook that is written.
    :return: None
    """

    asset_list = pd.read_excel(asset_list_path, header=0)

    epc_data = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        full_address = home["Address"]

        searcher = SearchEpc(
            address1=home["AddressLine1"],
            postcode=home["Postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            property_type=None,
            fast=True,
            full_address=full_address,
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None

        searcher.find_property(skip_os=True)
        if searcher.newest_epc is None:
            continue

        epc_data.append({"asset_list_address": full_address, **searcher.newest_epc})

    epc_df = pd.DataFrame(epc_data)

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "asset_list_address",
            "uprn",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
        ]
    ]

    # An address that appears more than once in the asset list is searched once per row, so it
    # would otherwise appear several times on the right-hand side too and the merge would
    # multiply those rows. One EPC row per address keeps the output the same length as the input.
    epc_df = epc_df.drop_duplicates(subset="asset_list_address")

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        left_on=["Address"],
        right_on=["asset_list_address"],
    )
    asset_list = asset_list.drop(columns=["asset_list_address"])

    # Rename the columns
    asset_list = asset_list.rename(
        columns={
            "inspection-date": "Date of last EPC",
            "current-energy-efficiency": "SAP score on register",
            "current-energy-rating": "EPC rating on register",
            "property-type": "EPC Property Type",
            "built-form": "EPC Archetype",
            "total-floor-area": "EPC Property Floor Area",
            "construction-age-band": "EPC Property Age Band",
            "floor-height": "EPC Property Floor Height",
            "number-habitable-rooms": "EPC Number of Habitable Rooms",
            "walls-description": "EPC Wall Construction",
            "roof-description": "EPC Roof Construction",
            "mainheat-description": "EPC Heating Type",
            "secondheat-description": "EPC Secondary Heating",
            "transaction-type": "Reason for last EPC",
        }
    )

    def floors_for(row):
        # No EPC property type (e.g. no EPC found) means there is nothing to estimate from.
        if pd.isnull(row["EPC Property Type"]):
            return None
        return estimate_number_of_floors(property_type=row["EPC Property Type"])

    asset_list["Estimated Number of Floors"] = asset_list.apply(floors_for, axis=1)

    # Empty strings are replaced with None before the float cast. The same treatment is
    # applied to both numeric columns so that an empty floor area no longer raises.
    for column in ("EPC Property Floor Area", "EPC Number of Habitable Rooms"):
        asset_list[column] = np.where(asset_list[column] == "", None, asset_list[column])
        asset_list[column] = asset_list[column].astype(float)

    def perimeter_for(row):
        floors = row["Estimated Number of Floors"]
        return estimate_perimeter(
            floor_area=row["EPC Property Floor Area"] / floors,
            num_rooms=row["EPC Number of Habitable Rooms"] / floors,
        )

    asset_list["Estimated Perimeter (m)"] = asset_list.apply(perimeter_for, axis=1)

    def external_wall_for(row):
        return estimate_external_wall_area(
            num_floors=row["Estimated Number of Floors"],
            floor_height=_floor_height_or_default(row["EPC Property Floor Height"]),
            perimeter=row["Estimated Perimeter (m)"],
            built_form=row["EPC Archetype"],
        )

    # NOTE: the column name is kept as-is for output compatibility, although the value stored
    # is whatever estimate_external_wall_area returns.
    asset_list["Estimated Heat Loss Perimeter (m)"] = asset_list.apply(external_wall_for, axis=1)

    def roof_insulation_for(row):
        description = row["EPC Roof Construction"]
        if pd.isnull(description):
            return None
        return RoofAttributes(description=description).process()["insulation_thickness"]

    asset_list["Roof Insulation Thickness"] = asset_list.apply(roof_insulation_for, axis=1)

    # Store as an excel
    asset_list.to_excel(output_filename, index=False)


def _floor_height_or_default(raw_height, default=2.5):
    """
    Return the EPC floor height as a float, or ``default`` when it is missing.

    NaN is truthy in Python, so a plain truthiness test lets the NaN produced by the left merge
    slip through instead of falling back to the default; it is checked for explicitly here.
    Other falsy values (empty string, None, 0) fall back to the default as before.
    """
    if pd.isnull(raw_height) or not raw_height:
        return default
    return float(raw_height)
|
||||
|
|
@ -10,11 +10,47 @@ from fuzzywuzzy import fuzz
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import time
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
||||
# Manual internal_id -> UPRN overrides, for properties where we do have an EPC but the EPC
# did not give us a UPRN. One (internal_id, mapped_uprn) pair per property.
missing_uprn_map = pd.DataFrame(
    [
        # 1 Church Street, Alfreton, DE55 7AH - doesn't seem to exist any more
        (78, None),
        # 1 Granville Road, Luton, LU1 1PA
        (315, 100080148856),
        # 11 College Street, Birstall, Batley, WF17 9HF - the EPC record is for 11 and 11a
        (1090, 83190440),
        # 11a College Street, Birstall, Batley, WF17 9HF
        (1092, 83143766),
        # Flat 5 Friars Street, Hereford, HR4 0AS
        # TODO: Check this - this UPRN is for 5 Friars Court, which is a flat
        (1384, 200002600892),
        # Flat 7 Friars Street, Hereford, HR4 0AS
        # TODO: Check this - this UPRN is for 7 Friars Court, which is a flat
        (1385, 200002600894),
        # 1 Waverley Street, Dudley, DY2 0YE
        (3349, 90022438),
        # 5 Brighton Road, Burgh Heath, Tadworth, KT20 6BQ
        # TODO: Check this - this UPRN is for 5 Copthorne, Brighton Road, Burgh Heath,
        # KT20 6BQ, which is a flat
        (5027, 100062145273),
        # Room 1, 21 Coxford Road, Southampton, SO16 5FG - this UPRN is for 21 Coxford Road
        (5554, 100060692392),
    ],
    columns=["internal_id", "mapped_uprn"],
)

# internal_ids whose EPC records are excluded
internal_id_epcs_to_drop = [315, 1384, 1385, 3349]
|
||||
|
||||
|
||||
def remove_commas_and_full_stops(input_string: str) -> str:
|
||||
"""
|
||||
|
|
@ -610,7 +646,58 @@ def compile_data():
|
|||
header_row=4
|
||||
)
|
||||
|
||||
# TODO: Read in UPRNs
|
||||
# TODO: Read in UPRNs or UDPRN
|
||||
|
||||
epc_data = json.loads(
|
||||
read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data.json"
|
||||
)
|
||||
)
|
||||
epc_data = pd.DataFrame(epc_data)
|
||||
|
||||
# We drop some EPCs
|
||||
epc_data = epc_data[~epc_data["internal_id"].isin(internal_id_epcs_to_drop)]
|
||||
|
||||
# This we can use to produce additional variables such as number of old surveys
|
||||
older_epc_data = json.loads(
|
||||
read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/old_epc_data.json"
|
||||
)
|
||||
)
|
||||
older_epc_data = {k: v for k, v in older_epc_data.items() if k not in internal_id_epcs_to_drop}
|
||||
|
||||
# This is the first ordnance survey data pull
|
||||
os_most_relevant_1 = []
|
||||
os_all_1 = {}
|
||||
for i in tqdm(["1", "2", "3"]):
|
||||
most_relevant_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_most_relevant_{i}.json"
|
||||
)
|
||||
os_most_relevant_1.extend(json.loads(most_relevant_segment))
|
||||
os_all_segment = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name=f"customers/Stonewater/clustering/os_all_{i}.json"
|
||||
)
|
||||
os_all_1 = {**os_all_1, **json.loads(os_all_segment)}
|
||||
|
||||
os_most_relevant_1 = pd.DataFrame(os_most_relevant_1)
|
||||
|
||||
# This is the second ordnance survey data pull
|
||||
os_most_relevant_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os.json"
|
||||
)
|
||||
os_most_relevant_2 = json.loads(os_most_relevant_2)
|
||||
os_most_relevant_2 = pd.DataFrame(os_most_relevant_2)
|
||||
|
||||
os_all_2 = read_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/problematic_os_all.json"
|
||||
)
|
||||
os_all_2 = json.loads(os_all_2)
|
||||
|
||||
########################################################################
|
||||
# Prepare asset list
|
||||
|
|
@ -664,3 +751,176 @@ def compile_data():
|
|||
|
||||
if pd.isnull(asset_list["full_address"]).sum():
|
||||
raise ValueError("Missing full addresses")
|
||||
|
||||
# Quick check to see if we have os data for every property that doesn't have an EPC
|
||||
without_epc = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
|
||||
os_most_relevant_1_internal_ids = os_most_relevant_1["internal_id"].tolist()
|
||||
os_most_relevant_2_internal_ids = os_most_relevant_2["internal_id"].tolist()
|
||||
|
||||
missing_os_data = []
|
||||
for _, x in without_epc.iterrows():
|
||||
# We would prioritise the data pulled the second time around
|
||||
|
||||
internal_id = x["internal_id"]
|
||||
if internal_id in os_most_relevant_2_internal_ids:
|
||||
continue
|
||||
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
continue
|
||||
|
||||
missing_os_data.append(internal_id)
|
||||
|
||||
if len(missing_os_data):
|
||||
raise Exception("We don't have SOME data for each internal_id")
|
||||
|
||||
# For the EPC data, some of them are missing UPRN
|
||||
epc_data_to_address = asset_list[
|
||||
asset_list["internal_id"].isin(epc_data["internal_id"].values)
|
||||
][
|
||||
["full_address", "internal_id"]].merge(
|
||||
epc_data, how="left", on="internal_id"
|
||||
)
|
||||
missed_uprn = epc_data_to_address[epc_data_to_address["uprn"] == ""]
|
||||
|
||||
# Once we have UPRNs, we might want to pull in the EPC data again
|
||||
# epc_data_with_uprn = []
|
||||
# older_epc_data_with_uprn = {}
|
||||
#
|
||||
# for row_number, asset in tqdm(asset_list.iterrows(), total=len(asset_list)):
|
||||
# searcher = SearchEpc(
|
||||
# address1=str(asset["address1"]),
|
||||
# postcode=str(asset["postcode"]),
|
||||
# auth_token=EPC_AUTH_TOKEN,
|
||||
# os_api_key="",
|
||||
# full_address=str(asset["full_address"]),
|
||||
# uprn=asset["uprn"]
|
||||
# )
|
||||
# searcher.find_property(skip_os=True)
|
||||
#
|
||||
# if searcher.newest_epc is None:
|
||||
# continue
|
||||
#
|
||||
# epc_data_with_uprn.append(
|
||||
# {
|
||||
# "internal_id": asset["internal_id"],
|
||||
# **searcher.newest_epc
|
||||
# }
|
||||
# )
|
||||
#
|
||||
# if searcher.older_epcs is not None:
|
||||
# older_epc_data_with_uprn[asset["internal_id"]] = searcher.older_epcs
|
||||
|
||||
# We now get the remaining properties
|
||||
# TODO: We might want to use epc_data_with_uprn
|
||||
remaining_properties = asset_list[~asset_list["internal_id"].isin(epc_data["internal_id"].values)]
|
||||
|
||||
# We estimate the data
|
||||
final_epcs = []
|
||||
for _, p in remaining_properties.iterrows():
|
||||
internal_id = p["internal_id"]
|
||||
uprn = p["UPRN"]
|
||||
|
||||
if internal_id in os_most_relevant_1_internal_ids:
|
||||
p_os_data = os_most_relevant_1[os_most_relevant_1["internal_id"] == internal_id].to_dict("records")[0]
|
||||
p_os_full = os_all_1[str(internal_id)]
|
||||
else:
|
||||
p_os_data = os_most_relevant_2[os_most_relevant_2["internal_id"] == internal_id].to_dict("records")[0]
|
||||
p_os_full = os_all_2[str(internal_id)]
|
||||
p_os_full = pd.DataFrame(
|
||||
[x["DPA"] if "DPA" in x else x["LPI"] for x in p_os_full]
|
||||
)
|
||||
|
||||
# TODO: Add this back in
|
||||
# When we have this
|
||||
if p["uprn"] != p_os_data["UPRN"]:
|
||||
# Get it from the older data
|
||||
filtered = p_os_full[p_os_full["UPRN"] == p["uprn"]]
|
||||
p_os_data = filtered.to_dict("records")[0]
|
||||
|
||||
searcher = SearchEpc(
|
||||
address1=str(p["address1"]),
|
||||
postcode=str(p["postcode"]),
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key="",
|
||||
uprn=uprn
|
||||
)
|
||||
searcher.ordnance_survey_client.parse_classification_code(p_os_data["CLASSIFICATION_CODE"])
|
||||
|
||||
searcher.find_property(skip_os=True)
|
||||
|
||||
final_epcs.append(
|
||||
{
|
||||
"internal_id": internal_id,
|
||||
**searcher.newest_epc
|
||||
}
|
||||
)
|
||||
|
||||
final_epcs = pd.DataFrame(final_epcs)
|
||||
|
||||
complete_epcs = pd.concat(
|
||||
[
|
||||
epc_data,
|
||||
final_epcs
|
||||
]
|
||||
)
|
||||
|
||||
# We now pull additional data
|
||||
uprns = complete_epcs["uprn"].tolist()
|
||||
# We get the spatial file list and loop through each EPC and determine which file it needs.
|
||||
# We then just read in the files that we need and get the data, for each uprn from that file
|
||||
|
||||
uprn_filenames = read_dataframe_from_s3_parquet(
|
||||
bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet"
|
||||
)
|
||||
|
||||
uprn_lookup = {}
|
||||
for uprn in complete_epcs["uprn"]:
|
||||
if not uprn:
|
||||
# TODO: Do something about this!
|
||||
continue
|
||||
filtered_df = uprn_filenames[
|
||||
(uprn_filenames["lower"] <= int(uprn))
|
||||
& (uprn_filenames["upper"] >= int(uprn))
|
||||
]
|
||||
if filtered_df["filenames"].values[0] in uprn_lookup:
|
||||
uprn_lookup[filtered_df["filenames"].values[0]].append(int(uprn))
|
||||
else:
|
||||
uprn_lookup[filtered_df["filenames"].values[0]] = [int(uprn)]
|
||||
|
||||
spatial_data_to_uprn = []
|
||||
for filename, associated_uprn in tqdm(uprn_lookup.items(), total=len(uprn_lookup)):
|
||||
# Read in the file
|
||||
spatial_data = read_dataframe_from_s3_parquet(
|
||||
bucket_name="retrofit-data-dev", file_key=f"spatial/{filename}"
|
||||
)
|
||||
|
||||
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
|
||||
spatial_data_to_uprn.append(spatial_df)
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
|
||||
# TODO: Let's store this in s3
|
||||
save_data_to_s3(
|
||||
data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
# We merge this spatial data onto final EPCS
|
||||
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
||||
columns=["partition", "filename"]
|
||||
).rename(columns={"UPRN": "uprn"})
|
||||
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
|
||||
|
||||
property_attributes = complete_epcs.merge(
|
||||
spatial_data_to_uprn,
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
# We drop the columns we don't care about for clustering
|
||||
property_attributes = property_attributes.drop(
|
||||
columns=[
|
||||
|
||||
]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ class RoofAttributes(Definitions):
|
|||
"""
|
||||
|
||||
self.description: str = description.lower().strip()
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or self.description == "sap05:roof"
|
||||
|
||||
self.welsh_translation_search()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue