mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
AIHA data review WIP
This commit is contained in:
parent
ceb34979e4
commit
15f55c021f
2 changed files with 454 additions and 1 deletions
|
|
@ -7,6 +7,9 @@ import pandas as pd
|
|||
import numpy as np
|
||||
from epc_api.client import EpcClient
|
||||
from backend.OrdnanceSurvey import OrdnanceSuveyClient
|
||||
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from BaseUtility import Definitions
|
||||
from utils.logger import setup_logger
|
||||
from typing import List
|
||||
|
|
@ -181,6 +184,7 @@ class SearchEpc:
|
|||
self.newest_epc = None
|
||||
self.older_epcs = None
|
||||
self.full_sap_epc = None
|
||||
self.metadata = None
|
||||
|
||||
# These are the address and postcode values, which we store in the database
|
||||
self.address_clean = None
|
||||
|
|
@ -306,7 +310,10 @@ class SearchEpc:
|
|||
if (property_type is None) and (address is None):
|
||||
return rows
|
||||
|
||||
if len(uprns) == 1:
|
||||
unique_property_types = {r["property-type"] for r in rows}
|
||||
|
||||
# We allow for variation in property type across flats/maisonettes
|
||||
if (len(uprns) == 1) and ((len(unique_property_types) == 1) or unique_property_types == {"Flat", "Maisonette"}):
|
||||
return rows
|
||||
|
||||
if property_type is not None:
|
||||
|
|
@ -784,3 +791,86 @@ class SearchEpc:
|
|||
self.address_clean = self.ordnance_survey_client.address_os
|
||||
self.postcode_clean = self.ordnance_survey_client.postcode_os
|
||||
return
|
||||
|
||||
def check_attribute_variations(self):
    """
    Report whether the wall, roof or floor construction flags have changed between EPCs.

    For each of the "walls-description", "roof-description" and "floor-description" fields, every EPC in
    ``self.older_epcs`` followed by ``self.newest_epc`` is passed through the matching attribute cleaner.
    The processed results form a timeline, and a description counts as "varied" if any tracked flag takes
    more than one distinct value across that timeline.

    :return: dict with the keys "has_wall_type_ever_varied", "has_roof_type_ever_varied" and
        "has_floor_type_ever_varied", each mapped to a bool
    """
    attribute_map = {
        "walls-description": {
            "cleaner": WallAttributes,
            "attribute": [
                "is_cavity_wall", "is_solid_brick", "is_system_built", "is_timber_frame",
                "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone", "is_park_home",
            ],
            "name": "has_wall_type_ever_varied",
        },
        "roof-description": {
            "cleaner": RoofAttributes,
            "attribute": [
                "is_flat", "is_pitched", "is_roof_room", "is_thatched", "has_dwelling_above",
            ],
            "name": "has_roof_type_ever_varied",
        },
        "floor-description": {
            "cleaner": FloorAttributes,
            # "is_to_external_air" was previously listed twice; checking it once is sufficient
            "attribute": [
                "is_to_unheated_space", "is_to_external_air", "is_suspended", "is_solid",
            ],
            "name": "has_floor_type_ever_varied",
        },
    }

    # older_epcs is None when no earlier certificates have been stored, so treat that as an empty
    # history rather than failing when we iterate over it
    older_epcs = [] if self.older_epcs is None else list(self.older_epcs)
    epc_history = older_epcs + [self.newest_epc]

    attribute_variations = {}
    for attribute, attribute_objs in attribute_map.items():
        cleaner = attribute_objs["cleaner"]
        # One row per EPC, one column per processed flag
        # NOTE(review): assumes process() returns a mapping containing every tracked flag - confirm
        type_timeline = pd.DataFrame([cleaner(epc[attribute]).process() for epc in epc_history])

        # For each col in attribute_objs["attribute"] we check if the timeline has ever varied, i.e. has
        # taken more than one distinct value. any() stops at the first flag that varies
        attribute_variations[attribute_objs["name"]] = any(
            type_timeline[col].nunique() > 1 for col in attribute_objs["attribute"]
        )

    return attribute_variations
|
||||
|
||||
def identify_flat_floor(self):
    """
    Classify the flat described by the newest EPC as "top", "mid" or "ground" floor.

    The roof description is inspected first: with no dwelling above, the flat is "top". Otherwise the
    floor description decides between "mid" (another property below) and "ground".

    :return: one of "top", "mid" or "ground"
    """
    roof = RoofAttributes(self.newest_epc["roof-description"]).process()

    if roof["has_dwelling_above"]:
        # There is a dwelling above, so the floor description separates mid from ground. It is only
        # parsed on this branch, as a top floor flat never needs it
        floor = FloorAttributes(self.newest_epc["floor-description"]).process()
        return "mid" if floor["another_property_below"] else "ground"

    # Nothing above us, so this is the top floor
    return "top"
|
||||
|
||||
def get_metadata(self):
    """
    Build summary metadata for the matched property and store it on ``self.metadata``.

    The stored dict contains:
        - "days_since_last_epc": whole days between now and the newest EPC's "lodgement-date"
        - "has_sap_ever_downgraded": True if "current-energy-efficiency" drops between any two
          consecutive EPCs in the timeline (older EPCs followed by the newest)
        - "floor": result of identify_flat_floor() when the newest EPC's "property-type" is "Flat",
          otherwise None
        - every key returned by check_attribute_variations()

    :raises ValueError: if ``self.newest_epc`` is None
    """
    if self.newest_epc is None:
        raise ValueError("No EPC data available")

    # older_epcs is None when no earlier certificates have been stored, so treat that as an empty
    # history rather than failing when we iterate over it
    older_epcs = [] if self.older_epcs is None else self.older_epcs

    # We check if the property has ever been downgraded on SAP
    # NOTE(review): assumes older_epcs is ordered oldest to newest - confirm against find_property
    sap_timeline = [int(epc["current-energy-efficiency"]) for epc in older_epcs]
    sap_timeline.append(int(self.newest_epc["current-energy-efficiency"]))

    # A negative step in the differenced timeline means the score decreased at some point.
    # With a single EPC the difference is empty and the result is False
    has_sap_ever_downgraded = bool(np.any(np.diff(sap_timeline) < 0))

    # We check if the wall, roof or floor type has ever varied over time
    attribute_variations = self.check_attribute_variations()

    # If the property is a flat, we distinguish between top, mid, ground floor
    floor = None
    if self.newest_epc["property-type"] == "Flat":
        floor = self.identify_flat_floor()

    self.metadata = {
        "days_since_last_epc": (pd.Timestamp.now() - pd.Timestamp(self.newest_epc["lodgement-date"])).days,
        "has_sap_ever_downgraded": has_sap_ever_downgraded,
        "floor": floor,
        **attribute_variations,
    }
|
||||
|
|
|
|||
363
etl/customers/aiha/epc_data_pull.py
Normal file
363
etl/customers/aiha/epc_data_pull.py
Normal file
|
|
@ -0,0 +1,363 @@
|
|||
import os
|
||||
from tqdm import tqdm
|
||||
from dotenv import load_dotenv
|
||||
import pandas as pd
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from etl.spatial.OpenUprnClient import OpenUprnClient
|
||||
|
||||
# Read environment variables from the backend .env file, then pick up the EPC auth token
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Raise the pandas display limits used when printing DataFrames
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
|
||||
|
||||
|
||||
def app():
|
||||
# Retrieve EPC data for the SHDF AIHA portfolio
|
||||
|
||||
data = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Khalim Review - 240902 - KSQ - AIHA - SHDF Wave "
|
||||
"3 bid - Supplementary information.xlsx",
|
||||
sheet_name="All units information",
|
||||
header=3
|
||||
)
|
||||
|
||||
# Remove the .eg row
|
||||
data = data.tail(-1)
|
||||
|
||||
# Remove the bottom 2 rows
|
||||
data = data.head(-2)
|
||||
data = data.reset_index(drop=True)
|
||||
data["row_id"] = data.index
|
||||
|
||||
ammendments = {
|
||||
"12 11-18 Schonfeld Square": "12 Schonfeld Square",
|
||||
"35 35-37 Schonfeld Square": "35 Schonfeld Square",
|
||||
'77 Schonfeld Square': '77 Lordship Road',
|
||||
"83 Lordship Road (Schonfeld Square)": "83 Lordship Road",
|
||||
"A 80 Bethune Road": "80A Bethune Road",
|
||||
"86B Bethune Road": "Flat B, 86 Bethune Road",
|
||||
"22 Glendale Road": "22 Glendale Avenue",
|
||||
"121 Southbourne Road": "121 Southbourne Grove",
|
||||
}
|
||||
|
||||
no_epc = [
|
||||
"80B Bethune Road",
|
||||
"89B Manor Road",
|
||||
"12 Monkville Avenue",
|
||||
"9 Greenview",
|
||||
]
|
||||
|
||||
property_type_map = {
|
||||
"House, mid-terrace": "House",
|
||||
"House, end terrace": "House",
|
||||
"House, semi-detached": "House",
|
||||
"House, detached": "House",
|
||||
"Flat": "Flat",
|
||||
}
|
||||
|
||||
epc_data = []
|
||||
epc_metadata = []
|
||||
for _, home in tqdm(data.iterrows(), total=len(data)):
|
||||
|
||||
# Build address 1 based on if there is:
|
||||
# 1) Address letter or number
|
||||
# 2) Street address
|
||||
|
||||
modified = False
|
||||
address1 = ""
|
||||
address1_backup = ""
|
||||
|
||||
if home["Address letter or number"] in ["A", "B", "C"]:
|
||||
|
||||
house_no = home['Street address'].split(' ')[0]
|
||||
street = ' '.join(home['Street address'].split(' ')[1:])
|
||||
address1 = f"{house_no}{home['Address letter or number']} {street}"
|
||||
|
||||
address1_backup = f"Flat {home['Address letter or number']} {house_no} {street}"
|
||||
modified = True
|
||||
|
||||
else:
|
||||
if not pd.isnull(home["Address letter or number"]):
|
||||
address1 += f"{home['Address letter or number']} "
|
||||
if not pd.isnull(home["Street address"]):
|
||||
address1 += f"{home['Street address']}"
|
||||
address1 = address1.strip()
|
||||
|
||||
if address1.split(" ")[-1].lower() == "rd":
|
||||
# Replace with road
|
||||
address1 = address1.lower().replace(" rd", " road")
|
||||
|
||||
# Specific amendments
|
||||
if address1 in ammendments:
|
||||
address1 = ammendments[address1]
|
||||
|
||||
if address1 in no_epc:
|
||||
continue
|
||||
|
||||
searcher = SearchEpc(
|
||||
address1=address1,
|
||||
postcode=home["Postcode"],
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key="",
|
||||
property_type=property_type_map[home["Property type"]]
|
||||
)
|
||||
searcher.find_property(skip_os=True)
|
||||
|
||||
if searcher.newest_epc is None and modified:
|
||||
searcher = SearchEpc(
|
||||
address1=address1_backup,
|
||||
postcode=home["Postcode"],
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key="",
|
||||
property_type=property_type_map[home["Property type"]]
|
||||
)
|
||||
searcher.find_property(skip_os=True)
|
||||
|
||||
if searcher.newest_epc is None:
|
||||
raise Exception("Not found")
|
||||
|
||||
epc_data.append(
|
||||
{
|
||||
"row_id": home["row_id"],
|
||||
**searcher.newest_epc
|
||||
}
|
||||
)
|
||||
|
||||
searcher.get_metadata()
|
||||
|
||||
epc_metadata.append(
|
||||
{
|
||||
"row_id": home["row_id"],
|
||||
"address": address1,
|
||||
"postcode": home["Postcode"],
|
||||
**searcher.metadata
|
||||
}
|
||||
)
|
||||
|
||||
epc_metadata = pd.DataFrame(epc_metadata)
|
||||
epc_data = pd.DataFrame(epc_data)
|
||||
|
||||
# Check matched addresses
|
||||
matched_addresses = epc_metadata[["row_id", "address", "postcode"]].copy()
|
||||
matched_addresses = matched_addresses.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
)
|
||||
|
||||
# We look for differences between the asset list and the EPC data
|
||||
comparison_cols = {
|
||||
"Property type": [
|
||||
{
|
||||
"epc_col": "property-type",
|
||||
"map": property_type_map
|
||||
},
|
||||
{
|
||||
"epc_col": "built-form",
|
||||
"map": {
|
||||
"House, mid-terrace": "Mid-Terrace",
|
||||
"House, end terrace": "End-Terrace",
|
||||
"House, semi-detached": "Semi-Detached",
|
||||
"House, detached": "Detached",
|
||||
"Flat": "Flat",
|
||||
}
|
||||
}
|
||||
],
|
||||
"Energy starting band (EPC)": [
|
||||
{
|
||||
"epc_col": "current-energy-rating",
|
||||
"map": {}
|
||||
}
|
||||
],
|
||||
"Wall type": [
|
||||
{
|
||||
"epc_col": "walls-description",
|
||||
"search_terms": {
|
||||
"solid": "Solid brick",
|
||||
"cavity": "Cavity wall",
|
||||
"solid - internal lining": "Solid brick",
|
||||
}
|
||||
}
|
||||
],
|
||||
"Roof type": [
|
||||
{
|
||||
"epc_col": "roof-description",
|
||||
"search_terms": {
|
||||
"pitched": "Pitched",
|
||||
"n/a - (flat above)": "another dwelling above"
|
||||
}
|
||||
}
|
||||
],
|
||||
"Floor type": [
|
||||
{
|
||||
"epc_col": "floor-description",
|
||||
"search_terms": {
|
||||
"solid": "Solid",
|
||||
"suspended": "Suspended",
|
||||
"solid - floating floor for services": "Solid"
|
||||
}
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
import re
|
||||
differences = []
|
||||
for asset_list_col, list_of_configs in comparison_cols.items():
|
||||
|
||||
if asset_list_col in ["Wall type", "Roof type", "Floor type"]:
|
||||
config = list_of_configs[0]
|
||||
# We handle this differently
|
||||
remapped = data[["row_id", asset_list_col]].copy()
|
||||
# Strip the asset list col in case of leading/trailing spaces
|
||||
remapped[asset_list_col] = remapped[asset_list_col].str.strip()
|
||||
remapped[asset_list_col] = remapped[asset_list_col].str.lower()
|
||||
remapped = remapped.merge(epc_data[["row_id", config["epc_col"]]], on="row_id", how="inner")
|
||||
# We do a search term check
|
||||
remapped["Match"] = None
|
||||
for search_term, epc_term in config["search_terms"].items():
|
||||
if "/" in search_term:
|
||||
escaped_search_term = re.escape(search_term)
|
||||
remapped.loc[remapped[asset_list_col].str.contains(escaped_search_term), "Match"] = (
|
||||
remapped.loc[
|
||||
remapped[asset_list_col].str.contains(escaped_search_term), config["epc_col"]
|
||||
].str.contains(epc_term)
|
||||
)
|
||||
else:
|
||||
remapped.loc[remapped[asset_list_col].str.contains(search_term), "Match"] = (
|
||||
remapped.loc[
|
||||
remapped[asset_list_col].str.contains(search_term), config["epc_col"]
|
||||
].str.contains(epc_term)
|
||||
)
|
||||
|
||||
if pd.isnull(remapped["Match"]).sum():
|
||||
raise Exception("Not all matched")
|
||||
|
||||
remapped["Match"] = remapped["Match"].astype(bool)
|
||||
|
||||
if not all(remapped["Match"]):
|
||||
differences.append(
|
||||
{
|
||||
"Column": asset_list_col,
|
||||
"Differences": remapped[~remapped["Match"]],
|
||||
}
|
||||
)
|
||||
|
||||
continue
|
||||
|
||||
for config in list_of_configs:
|
||||
|
||||
remapped = data[["row_id", asset_list_col]].copy()
|
||||
if config["map"]:
|
||||
remapped[asset_list_col] = remapped[asset_list_col].map(config["map"])
|
||||
|
||||
# Merge on
|
||||
remapped = remapped.merge(epc_data[["row_id", config["epc_col"]]], on="row_id", how="inner")
|
||||
remapped["Match"] = remapped[asset_list_col] == remapped[config["epc_col"]]
|
||||
if not all(remapped["Match"]):
|
||||
differences.append(
|
||||
{
|
||||
"Column": asset_list_col,
|
||||
"Differences": remapped[~remapped["Match"]],
|
||||
}
|
||||
)
|
||||
|
||||
# Check for property type
|
||||
property_type_differences = differences[0]["Differences"].copy()
|
||||
property_type_differences = property_type_differences.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
)
|
||||
print(property_type_differences)
|
||||
|
||||
# Check for built form
|
||||
built_form_differences = differences[1]["Differences"].copy()
|
||||
built_form_differences = built_form_differences[built_form_differences["Property type"] != "Flat"]
|
||||
built_form_differences = built_form_differences.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
)
|
||||
print(built_form_differences)
|
||||
|
||||
# Check for energy rating
|
||||
energy_rating_differences = differences[2]["Differences"].copy()
|
||||
energy_rating_differences = energy_rating_differences.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
).merge(
|
||||
epc_data[["row_id", "uprn"]], on="row_id", how="inner"
|
||||
)
|
||||
print(energy_rating_differences)
|
||||
|
||||
# Check for wall type
|
||||
wall_type_differences = differences[3]["Differences"].copy()
|
||||
wall_type_differences = wall_type_differences.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
).merge(
|
||||
epc_data[["row_id", "uprn"]], on="row_id", how="inner"
|
||||
)
|
||||
print(wall_type_differences) # Many wall type differences
|
||||
|
||||
# Check for roof type
|
||||
roof_type_differences = differences[4]["Differences"].copy()
|
||||
roof_type_differences = roof_type_differences.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
).merge(
|
||||
epc_data[["row_id", "uprn"]], on="row_id", how="inner"
|
||||
)
|
||||
print(roof_type_differences) # Many roof type differences
|
||||
|
||||
# Check for floor type
|
||||
floor_type_differences = differences[5]["Differences"].copy()
|
||||
floor_type_differences = floor_type_differences.merge(
|
||||
data[["row_id", "Address letter or number", "Street address"]], on="row_id", how="inner"
|
||||
).merge(
|
||||
epc_data[["row_id", "uprn"]], on="row_id", how="inner"
|
||||
)
|
||||
print(floor_type_differences) # Many floor type differences
|
||||
|
||||
# TODO: 47 Ashtead Road [100021024699] shows solid brick wall on EPC - is probably cavity wall
|
||||
|
||||
# We have the EPC data. Let's check conservation area/historic/listed building status
|
||||
portfolio_spatial_data = OpenUprnClient.get_spatial_data(
|
||||
epc_data["uprn"].unique().tolist(), bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str)
|
||||
|
||||
spatial_data = data[["row_id", "Planning constraints"]].merge(
|
||||
epc_data[["row_id", "uprn"]], on="row_id", how="left",
|
||||
|
||||
).merge(
|
||||
portfolio_spatial_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]],
|
||||
left_on="uprn",
|
||||
right_on="UPRN", how="left"
|
||||
)
|
||||
|
||||
spatial_data[
|
||||
(spatial_data["Planning constraints"] == "None")
|
||||
]["conservation_status"].value_counts()
|
||||
|
||||
# One property is in a conservation area, that was not picked up in the asset data
|
||||
print(spatial_data[
|
||||
(spatial_data["Planning constraints"] == "None") &
|
||||
(spatial_data["conservation_status"] == True)
|
||||
].merge(
|
||||
data[["row_id", "Address letter or number", "Street address", "Postcode"]], on="row_id", how="left"
|
||||
))
|
||||
|
||||
# All properties match up apart from one where the asset data indicates it's in a conservation area, however
|
||||
# the spatial data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio
|
||||
|
||||
# Draft archetyping
|
||||
archetyping_data = data[
|
||||
[
|
||||
"row_id",
|
||||
"Energy starting band (EPC)",
|
||||
"Property type",
|
||||
"Property year built",
|
||||
"Gross internal area (sqm)",
|
||||
"Current heating system type",
|
||||
"Wall type",
|
||||
"Floor type",
|
||||
"Roof type",
|
||||
"Window type",
|
||||
"Location (Floor)",
|
||||
]
|
||||
]
|
||||
Loading…
Add table
Reference in a new issue