mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
791 lines
27 KiB
Python
791 lines
27 KiB
Python
import hashlib
import os
import re

import msgpack
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient
from utils.s3 import read_from_s3
|
|
|
|
# Read the backend's .env file so the EPC API token is available via the environment.
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.environ.get("EPC_AUTH_TOKEN")

# Widen the pandas console output so the difference tables printed by app()
# are not truncated.
for _display_option, _display_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_display_option, _display_value)
|
|
|
|
|
|
# Default locations reproduce the paths that were previously hard-coded in app().
DEFAULT_ASSET_LIST_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Khalim Review - 240902 - KSQ - AIHA - SHDF Wave "
    "3 bid - Supplementary information.xlsx"
)
DEFAULT_OUTPUT_DIR = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA"

_ADDRESS_COLS = ["row_id", "Address letter or number", "Street address"]

# Manual corrections applied to the address built from the asset list before
# it is used for the EPC search.
_ADDRESS_AMENDMENTS = {
    "12 11-18 Schonfeld Square": "12 Schonfeld Square",
    "35 35-37 Schonfeld Square": "35 Schonfeld Square",
    '77 Schonfeld Square': '77 Lordship Road',
    "83 Lordship Road (Schonfeld Square)": "83 Lordship Road",
    "A 80 Bethune Road": "80A Bethune Road",
    "86B Bethune Road": "Flat B, 86 Bethune Road",
    "22 Glendale Road": "22 Glendale Avenue",
    "121 Southbourne Road": "121 Southbourne Grove",
}

# Addresses that are skipped entirely (no EPC search is attempted for them).
_NO_EPC_ADDRESSES = [
    "80B Bethune Road",
    "89B Manor Road",
    "12 Monkville Avenue",
    "9 Greenview",
]

_PROPERTY_TYPE_MAP = {
    "House, mid-terrace": "House",
    "House, end terrace": "House",
    "House, semi-detached": "House",
    "House, detached": "House",
    "Flat": "Flat",
}

# Asset-list column -> list of EPC comparisons. A config with "map" is compared
# by equality after mapping the asset-list value; a config with "search_terms"
# checks that rows whose asset-list text contains the key have an EPC
# description containing the value.
_COMPARISON_COLS = {
    "Property type": [
        {"epc_col": "property-type", "map": _PROPERTY_TYPE_MAP},
        {
            "epc_col": "built-form",
            "map": {
                "House, mid-terrace": "Mid-Terrace",
                "House, end terrace": "End-Terrace",
                "House, semi-detached": "Semi-Detached",
                "House, detached": "Detached",
                "Flat": "Flat",
            },
        },
    ],
    "Energy starting band (EPC)": [
        {"epc_col": "current-energy-rating", "map": {}},
    ],
    "Wall type": [
        {
            "epc_col": "walls-description",
            "search_terms": {
                "solid": "Solid brick",
                "cavity": "Cavity wall",
                "solid - internal lining": "Solid brick",
            },
        },
    ],
    "Roof type": [
        {
            "epc_col": "roof-description",
            "search_terms": {
                "pitched": "Pitched",
                "n/a - (flat above)": "another dwelling above",
            },
        },
    ],
    "Floor type": [
        {
            "epc_col": "floor-description",
            "search_terms": {
                "solid": "Solid",
                "suspended": "Suspended",
                "solid - floating floor for services": "Solid",
            },
        },
    ],
}

# Proposed remaps based on discoveries, keyed by UPRN.
_VALUE_REMAPS = {
    # 8 Filey Avenue
    "100021040744": {"variable": "Property type", "newvalue": "House, mid-terrace"},
    # 7 Yetev Lev Court
    "100021032043": {"variable": "Wall type", "newvalue": "Cavity"},
    # 14 Yetev Lev Court
    "100021032050": {"variable": "Wall type", "newvalue": "Cavity"},
    # 23 Yetev Lev Court
    "100021032059": {"variable": "Wall type", "newvalue": "Cavity"},
    # 30 Yetev Lev Court
    "100021032066": {"variable": "Wall type", "newvalue": "Cavity"},
    # 34 Yetev Lev Court
    "100021032070": {"variable": "Wall type", "newvalue": "Cavity"},
    # B 86 Bethune Road
    "100021026285": {"variable": "Wall type", "newvalue": "Solid"},
    # A 80 Bethune Road
    "100021026277": {"variable": "Wall type", "newvalue": "Solid"},
    # 140 Kyverdale Road
    "100021052262": {"variable": "Property type", "newvalue": "House, mid-terrace"},
    # 6 Leabourne Road
    "100021053799": {"variable": "Wall type", "newvalue": "Solid"},
    # 22 Britannia Gardens - needs confirmation
    # 7 Satanita Road - needs confirmation
    # 12 Cheltenham Crescent
    "100011402969": {"variable": "Wall type", "newvalue": "Cavity"},
    "100021031752": {"variable": "Roof type", "newvalue": "Room Roof"},
    # 79 Craven Park Road
    "100021169682": {"variable": "Roof type", "newvalue": "Room Roof"},
    # 88 Darenth Road
    "100021036148": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021036165": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021036167": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021053849": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021054353": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021054560": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021059839": {"variable": "Roof type", "newvalue": "Room Roof"},
    "100021059848": {"variable": "Roof type", "newvalue": "Room Roof"},
}

# (last year of the band, band letter); anything later than the final entry is 'L'.
_AGE_BAND_UPPER_YEARS = (
    (1899, 'A'), (1929, 'B'), (1949, 'C'), (1966, 'D'), (1975, 'E'), (1982, 'F'),
    (1990, 'G'), (1995, 'H'), (2002, 'I'), (2006, 'J'), (2011, 'K'),
)

# Floor-area groups analogous to the Energy Company Obligation:
# 0 - 72, 73 - 97, 98 - 199, 200+
_FLOOR_AREA_BINS = [0, 72, 97, 199, 1000]
_FLOOR_AREA_LABELS = ["0-72", "73-97", "98-199", "200+"]


def categorize_year(year):
    """Map a build year to the age-band letter stored in ``SAP_age_band``.

    ``year`` may be a number (``1930``, ``1930.0``) or a string such as
    ``"1930"`` or ``"1930s"``. For strings the first run of four digits is
    used, so surrounding whitespace no longer shifts the slice that used to
    be taken with ``year[:4]`` (``" 1930s"`` was previously read as 193).

    Returns a letter from 'A' (before 1900) to 'L' (2012 onwards).
    Raises ``ValueError`` when the value cannot be converted to an integer
    (including NaN), exactly as the plain ``int()`` conversion does.
    """
    if isinstance(year, str):
        four_digits = re.search(r"\d{4}", year)
        year = int(four_digits.group()) if four_digits else int(year)
    else:
        year = int(year)

    for last_year_in_band, band in _AGE_BAND_UPPER_YEARS:
        if year <= last_year_in_band:
            return band
    return 'L'  # year >= 2012


def _stable_archetype_hash(values):
    """Return an integer hash of ``values`` that is identical across runs.

    The built-in ``hash()`` is salted per process for strings, which made the
    archetype ordering (and therefore ``archetype_id``) change between runs.
    """
    digest = hashlib.sha256("|".join(map(str, values)).encode("utf-8")).hexdigest()
    # 15 hex digits = 60 bits, which still fits a signed 64-bit integer column.
    return int(digest[:15], 16)


def _build_addresses(home):
    """Build the EPC search address for one asset-list row.

    Returns ``(address1, address1_backup, modified)``. For units lettered
    A/B/C the letter is appended to the house number ("80A Bethune Road") and
    a "Flat A 80 Bethune Road" style backup is produced; ``modified`` is True
    only in that case.
    """
    unit = home["Address letter or number"]
    street_address = home["Street address"]

    address1_backup = ""
    modified = False
    if unit in ["A", "B", "C"]:
        house_no, _, street = street_address.partition(" ")
        address1 = f"{house_no}{unit} {street}"
        address1_backup = f"Flat {unit} {house_no} {street}"
        modified = True
    else:
        address1 = ""
        if not pd.isnull(unit):
            address1 += f"{unit} "
        if not pd.isnull(street_address):
            address1 += f"{street_address}"
        address1 = address1.strip()

    if address1.split(" ")[-1].lower() == "rd":
        # Expand only the trailing "rd" token. The address is lower-cased as
        # before, but " rd" occurring elsewhere in the string is left alone.
        address1 = address1.lower()[:-2] + "road"

    return address1, address1_backup, modified


def _search_epc(address1, home):
    """Run an EPC search (skipping the OS lookup) for ``address1`` and return the searcher."""
    searcher = SearchEpc(
        address1=address1,
        postcode=home["Postcode"],
        auth_token=EPC_AUTH_TOKEN,
        os_api_key="",
        property_type=_PROPERTY_TYPE_MAP[home["Property type"]],
    )
    searcher.find_property(skip_os=True)
    return searcher


def _fetch_epc_records(data):
    """Look up the newest EPC and its metadata for every row of ``data``.

    Returns ``(epc_data, epc_metadata)`` as DataFrames keyed by ``row_id``.
    Raises ``Exception("Not found")`` when a property that is not listed in
    ``_NO_EPC_ADDRESSES`` has no EPC under either address form.
    """
    epc_rows = []
    metadata_rows = []
    for _, home in tqdm(data.iterrows(), total=len(data)):
        address1, address1_backup, modified = _build_addresses(home)

        # Specific amendments
        address1 = _ADDRESS_AMENDMENTS.get(address1, address1)
        if address1 in _NO_EPC_ADDRESSES:
            continue

        searcher = _search_epc(address1, home)
        if searcher.newest_epc is None and modified:
            searcher = _search_epc(address1_backup, home)
        if searcher.newest_epc is None:
            raise Exception("Not found")

        epc_rows.append({"row_id": home["row_id"], **searcher.newest_epc})

        searcher.get_metadata()
        metadata_rows.append(
            {
                "row_id": home["row_id"],
                "address": address1,
                "postcode": home["Postcode"],
                **searcher.metadata,
            }
        )

    return pd.DataFrame(epc_rows), pd.DataFrame(metadata_rows)


def _compare_asset_list_to_epc(data, epc_data):
    """Compare asset-list columns with the EPC columns listed in ``_COMPARISON_COLS``.

    Returns a dict mapping each EPC column name to the DataFrame of rows that
    disagree. EPC columns with no disagreement are absent from the dict, so
    callers must look results up by name rather than by position.
    Raises ``Exception("Not all matched")`` when a search-term comparison
    leaves a row that none of the search terms applied to.
    """
    differences = {}
    for asset_list_col, configs in _COMPARISON_COLS.items():
        for config in configs:
            epc_col = config["epc_col"]
            remapped = data[["row_id", asset_list_col]].copy()

            if "search_terms" in config:
                # Strip/lower the asset list text in case of stray spaces or casing
                remapped[asset_list_col] = remapped[asset_list_col].str.strip().str.lower()
                remapped = remapped.merge(epc_data[["row_id", epc_col]], on="row_id", how="inner")
                remapped["Match"] = None
                for search_term, epc_term in config["search_terms"].items():
                    # Literal matching: the terms contain characters such as
                    # "(" that must not be interpreted as regex syntax.
                    hits = remapped[asset_list_col].str.contains(search_term, regex=False, na=False)
                    remapped.loc[hits, "Match"] = (
                        remapped.loc[hits, epc_col].str.contains(epc_term, regex=False)
                    )
                if remapped["Match"].isnull().any():
                    raise Exception("Not all matched")
                remapped["Match"] = remapped["Match"].astype(bool)
            else:
                if config["map"]:
                    remapped[asset_list_col] = remapped[asset_list_col].map(config["map"])
                remapped = remapped.merge(epc_data[["row_id", epc_col]], on="row_id", how="inner")
                remapped["Match"] = remapped[asset_list_col] == remapped[epc_col]

            if not remapped["Match"].all():
                differences[epc_col] = remapped[~remapped["Match"]]
    return differences


def _print_differences(differences, epc_col, data, epc_data=None, exclude_flats=False):
    """Print the mismatching rows for ``epc_col`` with their address (and UPRN if ``epc_data`` is given)."""
    found = differences.get(epc_col)
    if found is None:
        print(f"No differences found for {epc_col}")
        return
    report = found.copy()
    if exclude_flats:
        report = report[report["Property type"] != "Flat"]
    report = report.merge(data[_ADDRESS_COLS], on="row_id", how="inner")
    if epc_data is not None:
        report = report.merge(epc_data[["row_id", "uprn"]], on="row_id", how="inner")
    print(report)


def _attach_description_features(epc_data, cleaned):
    """Join the cleaned wall/roof/floor description features onto ``epc_data``.

    Each join matches the EPC description text against ``original_description``
    using pandas' default (inner) merge.
    """
    walls = pd.DataFrame(cleaned["walls-description"])[
        [
            'original_description',
            'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
            'is_as_built', 'is_assumed', 'insulation_thickness',
        ]
    ].rename(
        columns={
            "is_solid_brick": "is_solid_brick_wall",
            "is_system_built": "is_system_built_wall",
            "is_timber_frame": "is_timber_frame_wall",
            "is_assumed": "is_assumed_wall",
            "insulation_thickness": "insulation_thickness_wall",
        }
    )
    # NOTE: the roof's insulation_thickness keeps its unsuffixed name; the
    # column selection in app() relies on that.
    roofs = pd.DataFrame(cleaned["roof-description"])[
        [
            'original_description', 'is_pitched', 'is_roof_room', 'is_loft',
            'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
            'has_dwelling_above', 'insulation_thickness',
        ]
    ].rename(columns={"is_assumed": "is_assumed_roof"})
    floors = pd.DataFrame(cleaned["floor-description"])[
        ['original_description', 'is_solid', 'is_suspended', 'is_assumed', 'insulation_thickness']
    ].rename(
        columns={
            "is_assumed": "is_assumed_floor",
            "insulation_thickness": "insulation_thickness_floor",
        }
    )
    return (
        epc_data
        .merge(walls, left_on="walls-description", right_on="original_description")
        .merge(roofs, left_on="roof-description", right_on="original_description")
        .merge(floors, left_on="floor-description", right_on="original_description")
    )


def app(asset_list_path=DEFAULT_ASSET_LIST_PATH, output_dir=DEFAULT_OUTPUT_DIR):
    """Retrieve EPC data for the SHDF AIHA portfolio and draft archetypes.

    Steps: read the asset list, look up each property's newest EPC, print the
    disagreements between the asset list and the EPC, check planning
    constraints against spatial data, then group properties into archetypes.

    Args:
        asset_list_path: Excel workbook containing the "All units information" sheet.
        output_dir: directory receiving ``basic-archetypes.csv``,
            ``epc_data.csv``, ``spatial_data.csv`` and ``archetyping_data.csv``.

    Raises:
        Exception: "Not found" when a property has no EPC, "Not all matched"
            when a search-term comparison leaves rows unmatched, "Mismatch in
            data" when the joins change the number of asset-list rows.
    """
    data = pd.read_excel(asset_list_path, sheet_name="All units information", header=3)
    # Remove the e.g. row at the top and the bottom 2 rows
    data = data.iloc[1:-2].reset_index(drop=True)
    data["row_id"] = data.index

    epc_data, epc_metadata = _fetch_epc_records(data)

    # Check matched addresses (not used further; kept for interactive inspection)
    matched_addresses = epc_metadata[["row_id", "address", "postcode"]].copy()
    matched_addresses = matched_addresses.merge(data[_ADDRESS_COLS], on="row_id", how="inner")

    # We look for differences between the asset list and the EPC data
    differences = _compare_asset_list_to_epc(data, epc_data)

    _print_differences(differences, "property-type", data)
    # Flats are excluded from the built-form report
    _print_differences(differences, "built-form", data, exclude_flats=True)
    _print_differences(differences, "current-energy-rating", data, epc_data)
    _print_differences(differences, "walls-description", data, epc_data)  # Many wall type differences
    _print_differences(differences, "roof-description", data, epc_data)  # Many roof type differences
    _print_differences(differences, "floor-description", data, epc_data)  # Many floor type differences

    # TODO: 47 Ashtead Road [100021024699] shows solid brick wall on EPC - is probably cavity wall

    # We have the EPC data. Let's check conservation area/historic/listed building status
    portfolio_spatial_data = OpenUprnClient.get_spatial_data(
        epc_data["uprn"].unique().tolist(), bucket_name="retrofit-data-dev"
    )
    portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str)

    spatial_data = data[["row_id", "Planning constraints"]].merge(
        epc_data[["row_id", "uprn"]], on="row_id", how="left",
    ).merge(
        portfolio_spatial_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]],
        left_on="uprn",
        right_on="UPRN",
        how="left",
    )

    no_constraints = spatial_data["Planning constraints"] == "None"
    # This count was previously computed and discarded; print it so it is visible.
    print(spatial_data[no_constraints]["conservation_status"].value_counts())

    # One property is in a conservation area, that was not picked up in the asset data
    print(
        spatial_data[no_constraints & (spatial_data["conservation_status"] == True)].merge(  # noqa: E712
            data[["row_id", "Address letter or number", "Street address", "Postcode"]], on="row_id", how="left"
        )
    )

    # All properties match up apart from one where the asset data indicates it's in a conservation area, however
    # the spatial data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio

    ################################################################
    # Draft archetyping
    ################################################################

    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)
    epc_data = _attach_description_features(epc_data, cleaned)

    archetyping_data = data[
        [
            "row_id",
            "Energy starting band (EPC)",
            "Property type",
            "Property year built",
            "Gross internal area (sqm)",
            "Current heating system type",
            "Wall type",
            "Floor type",
            "Roof type",
            "Window type",
            "Location (Floor)",
        ]
    ].merge(
        epc_metadata[["row_id", "floor"]], how="left", on="row_id"
    ).merge(
        epc_data[
            [
                "row_id", "uprn", "current-energy-rating", "property-type", "built-form", "total-floor-area",
                'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick_wall', 'is_system_built_wall',
                'is_timber_frame_wall', 'is_as_built', 'is_assumed_wall', 'insulation_thickness_wall',
                'is_solid', 'is_suspended', 'is_assumed_floor', 'insulation_thickness_floor',
                'is_pitched', 'is_roof_room', 'is_loft',
                'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed_roof',
                'has_dwelling_above', 'insulation_thickness', "mainheat-description",
                "local-authority-label",
            ]
        ],
        how="left",
        on="row_id"
    ).merge(
        spatial_data[["row_id", "conservation_status"]], on="row_id", how="left"
    )

    if archetyping_data.shape[0] != data.shape[0]:
        raise Exception("Mismatch in data")

    # Floor-area category: asset-list area first, EPC total floor area as backup.
    floor_area_category = pd.cut(
        archetyping_data["Gross internal area (sqm)"], bins=_FLOOR_AREA_BINS, labels=_FLOOR_AREA_LABELS
    ).fillna(
        pd.cut(
            archetyping_data["total-floor-area"].astype(float), bins=_FLOOR_AREA_BINS, labels=_FLOOR_AREA_LABELS
        )
    )
    # Fill "Unknown" while the values are still missing; converting to str
    # first turned NaN into the text "nan" so the fallback never applied.
    archetyping_data["Floor_area_category"] = floor_area_category.astype(object).where(
        floor_area_category.notna(), "Unknown"
    )

    archetyping_data["property-type-reduced"] = np.where(
        archetyping_data["property-type"].isin(["Flat", "Maisonette"]),
        "Flat/Maisonette",
        archetyping_data["property-type"]
    )

    archetyping_data["built-form-reduced"] = np.where(
        archetyping_data["built-form"].isin(["End-Terrace", "Semi-Detached"]),
        "End-Terrace/Semi-Detached",
        archetyping_data["built-form"]
    )
    archetyping_data["built-form-reduced"] = np.where(
        archetyping_data["property-type-reduced"] == "Flat/Maisonette",
        "Flat/Maisonette",
        archetyping_data["built-form-reduced"]
    )

    # Collapse the asset list's wall-type spellings (note the trailing spaces)
    for raw_values, normalised in (
        (['Solid ', 'Solid - internal lining '], "Solid"),
        (['Cavity ', 'cavity '], "Cavity"),
    ):
        archetyping_data["Wall type"] = np.where(
            archetyping_data["Wall type"].isin(raw_values), normalised, archetyping_data["Wall type"]
        )

    # Perform the remaps
    uprn_as_text = archetyping_data["uprn"].astype(str)
    for uprn, remap in _VALUE_REMAPS.items():
        archetyping_data[remap["variable"]] = np.where(
            uprn_as_text == uprn, remap["newvalue"], archetyping_data[remap["variable"]]
        )

    # Map the year to the age band
    archetyping_data["SAP_age_band"] = archetyping_data["Property year built"].apply(categorize_year)

    # Flag if the property is in London/Manchester
    # Hackney 73 - London
    # Southend-on-Sea 6 - Southend
    # Barnet 4 - London
    # Castle Point 4 - Southend
    # Haringey 3 - London
    # Salford 2 - Manchester
    # Bury 1 - Manchester
    archetyping_data["Location"] = np.where(
        archetyping_data["local-authority-label"].isin(["Hackney", "Barnet", "Haringey"]),
        "London",
        np.where(
            archetyping_data["local-authority-label"].isin(["Salford", "Bury"]),
            "Manchester",
            "Southend"
        )
    )
    # 9 Greenview is in manchester
    archetyping_data["Location"] = np.where(
        archetyping_data["row_id"] == data[data["Street address"] == "9 Greenview"]["row_id"].values[0],
        "Manchester",
        archetyping_data["Location"]
    )
    # We fix the location for B 80 Bethune Road
    bethune_b_row_ids = data.loc[
        (data["Street address"] == "80 Bethune Road") & (data["Address letter or number"] == "B"),
        "row_id",
    ]
    archetyping_data["Location"] = np.where(
        archetyping_data["row_id"].isin(bethune_b_row_ids),
        "London",
        archetyping_data["Location"]
    )

    primary_archetyping_cols = [
        'Property type',
        "Location (Floor)",
        'Current heating system type',
        'Wall type',
        'Roof type',
        # "Location",
        # 'current-energy-rating', 'property-type-reduced', 'built-form-reduced', 'is_cavity_wall',
        # 'is_solid_brick_wall', 'is_system_built_wall', 'is_timber_frame_wall', 'is_as_built',
        # 'is_solid', 'is_roof_room',
        # 'is_loft', 'is_flat', 'is_thatched',
        # 'is_at_rafters', 'has_dwelling_above',
        # 'conservation_status',
    ]

    # Not used below yet. The commas after the insulation columns were
    # missing, which merged three names into one string.
    secondary_cols = [
        'SAP_age_band',
        'is_filled_cavity',
        'insulation_thickness_wall',
        'insulation_thickness_floor',
        'insulation_thickness',
        'is_assumed_wall',
        'is_assumed_roof',
        'Floor_area_category',
    ]

    archetypes = archetyping_data[primary_archetyping_cols].drop_duplicates()
    # Hash the variables with a run-independent digest so archetype_id is reproducible
    archetypes["archetype_hash"] = archetypes.apply(
        lambda row: _stable_archetype_hash(row.values),
        axis=1
    )
    archetypes = archetypes.sort_values("archetype_hash", ascending=True)
    archetypes = archetypes.reset_index(drop=True)
    archetypes["archetype_id"] = archetypes.index

    archetypes.to_csv(os.path.join(output_dir, "basic-archetypes.csv"), index=False)

    # We match properties to archetypes
    archetyping_data = archetyping_data.merge(archetypes, on=primary_archetyping_cols, how="left")

    # We should choose a representative property for each archetype
    archetyping_data = archetyping_data.merge(
        epc_metadata[["row_id", "days_since_last_epc"]], how="left", on="row_id"
    )

    # Mark the property with the oldest EPC as the representative property
    representative_properties = archetyping_data.sort_values(
        ["archetype_id", "days_since_last_epc"], ascending=[True, False]
    ).drop_duplicates("archetype_id")
    archetyping_data["for_sample"] = archetyping_data["row_id"].isin(representative_properties["row_id"])

    address_columns = data[["row_id", "Address letter or number", "Street address", "Postcode"]]

    # Save the EPC data
    epc_data.to_csv(os.path.join(output_dir, "epc_data.csv"), index=False)

    # Save the spatial data
    spatial_data = address_columns.merge(spatial_data, on="row_id", how="left")
    spatial_data.to_csv(os.path.join(output_dir, "spatial_data.csv"), index=False)

    # Save archetyping data. It is written once, with the address columns; the
    # earlier address-less write to the same path was always overwritten.
    archetyping_data = address_columns.merge(archetyping_data, on="row_id", how="left")
    archetyping_data.to_csv(os.path.join(output_dir, "archetyping_data.csv"), index=False)
|