Adding to archetyping

This commit is contained in:
Khalim Conn-Kowlessar 2024-09-13 18:05:02 +01:00
parent 15f55c021f
commit 391c6f5cf0

View file

@ -2,6 +2,9 @@ import os
from tqdm import tqdm
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import msgpack
from utils.s3 import read_from_s3
from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient
@ -345,7 +348,63 @@ def app():
# All properties match up apart from one where the asset data indicates it's in a conservation area, however
# the spatial data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio
################################################################
# Draft archetyping
################################################################
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
cleaned = msgpack.unpackb(cleaned, raw=False)
epc_data = epc_data.merge(
pd.DataFrame(cleaned["walls-description"])[
['original_description',
'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
'is_as_built', 'is_assumed', 'insulation_thickness']
].rename(
columns={
"is_solid_brick": "is_solid_brick_wall",
"is_system_built": "is_system_built_wall",
"is_timber_frame": "is_timber_frame_wall",
"is_assumed": "is_assumed_wall",
"insulation_thickness": "insulation_thickness_wall"
}
),
left_on="walls-description",
right_on="original_description"
).merge(
pd.DataFrame(cleaned["roof-description"])[
[
'original_description', 'is_pitched', 'is_roof_room', 'is_loft',
'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
'has_dwelling_above', 'insulation_thickness'
]
].rename(
columns={
"is_assumed": "is_assumed_roof",
}
),
left_on="roof-description",
right_on="original_description"
).merge(
pd.DataFrame(cleaned["floor-description"])[
[
'original_description', 'is_solid', 'is_suspended', 'is_assumed',
'insulation_thickness'
]
].rename(
columns={
"is_assumed": "is_assumed_floor",
"insulation_thickness": "insulation_thickness_floor"
}
),
left_on="floor-description",
right_on="original_description"
)
archetyping_data = data[
[
"row_id",
@ -360,4 +419,353 @@ def app():
"Window type",
"Location (Floor)",
]
].merge(
epc_metadata[["row_id", "floor"]],
how="left",
on="row_id"
).merge(
epc_data[
[
"row_id", "uprn", "current-energy-rating", "property-type", "built-form", "total-floor-area",
'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick_wall', 'is_system_built_wall',
'is_timber_frame_wall', 'is_as_built', 'is_assumed_wall', 'insulation_thickness_wall',
'is_solid', 'is_suspended', 'is_assumed_floor', 'insulation_thickness_floor',
'is_pitched', 'is_roof_room', 'is_loft',
'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed_roof',
'has_dwelling_above', 'insulation_thickness', "mainheat-description",
"local-authority-label"
]
],
how="left",
on="row_id"
).merge(
spatial_data[["row_id", "conservation_status", ]],
on="row_id",
how="left"
)
if archetyping_data.shape[0] != data.shape[0]:
raise Exception("Mismatch in data")
# We create groups analogous to the Energy Company Obligation
# 0 - 72, 73 - 97, 98 - 199, 200+
# Band edges follow the 0-72 / 73-97 / 98-199 / 200+ grouping. The last edge is
# open-ended so that areas above 1000 sqm still fall into "200+" instead of
# becoming missing.
floor_area_bins = [0, 72, 97, 199, np.inf]
floor_area_labels = ["0-72", "73-97", "98-199", "200+"]
# Preferred source: the gross internal area column.
primary_band = pd.cut(
    archetyping_data["Gross internal area (sqm)"],
    bins=floor_area_bins,
    labels=floor_area_labels
).astype(object)
# Fallback source: the EPC total floor area.
backup_band = pd.cut(
    archetyping_data["total-floor-area"].astype(float),
    bins=floor_area_bins,
    labels=floor_area_labels
).astype(object)
# Resolve the missing values BEFORE any string conversion: casting to str first
# would turn NaN into the literal "nan", so the "Unknown" fallback would never fire.
floor_area_band = primary_band.where(primary_band.notna(), backup_band)
floor_area_band = floor_area_band.where(floor_area_band.notna(), "Unknown")
archetyping_data["Floor_area_category"] = floor_area_band.astype(str)
# Collapse flats and maisonettes into a single property type.
# The previous list contained the misspelling "Maisionette", which could never match
# the "Maisonette" spelling used in the combined label below.
# NOTE(review): confirm the exact spelling against the EPC property-type values.
archetyping_data["property-type-reduced"] = np.where(
    archetyping_data["property-type"].isin(["Flat", "Maisonette"]),
    "Flat/Maisonette",
    archetyping_data["property-type"]
)
archetyping_data["built-form-reduced"] = np.where(
archetyping_data["built-form"].isin(["End-Terrace", "Semi-Detached"]),
"End-Terrace/Semi-Detached",
archetyping_data["built-form"]
)
archetyping_data["built-form-reduced"] = np.where(
archetyping_data["property-type-reduced"] == "Flat/Maisonette",
"Flat/Maisonette",
archetyping_data["built-form-reduced"]
)
archetyping_data["Wall type"] = np.where(
archetyping_data["Wall type"].isin(['Solid ', 'Solid - internal lining ']),
"Solid",
archetyping_data["Wall type"]
)
archetyping_data["Wall type"] = np.where(
archetyping_data["Wall type"].isin(['Cavity ', 'cavity ']),
"Cavity",
archetyping_data["Wall type"]
)
# Proposed remaps based on discoveries
value_remaps = {
# 8 Filey Avenue
"100021040744": {
"variable": "Property type",
"newvalue": "House, mid-terrace",
},
# 7 Yetev Lev Court
"100021032043": {
"variable": "Wall type",
"newvalue": "Cavity",
},
# 14 Yetev Lev Court
"100021032050": {
"variable": "Wall type",
"newvalue": "Cavity",
},
# 23 Yetev Lev Court
"100021032059": {
"variable": "Wall type",
"newvalue": "Cavity",
},
# 30 Yetev Lev Court
"100021032066": {
"variable": "Wall type",
"newvalue": "Cavity",
},
# 34 Yetev Lev Court
"100021032070": {
"variable": "Wall type",
"newvalue": "Cavity",
},
# B 86 Bethune Road
"100021026285": {
"variable": "Wall type",
"newvalue": "Solid",
},
# A 80 Bethune Road
"100021026277": {
"variable": "Wall type",
"newvalue": "Solid",
},
# 140 Kyverdale Road
"100021052262": {
"variable": "Property type",
"newvalue": "House, mid-terrace",
},
# 6 Leabourne Road
"100021053799": {
"variable": "Wall type",
"newvalue": "Solid",
},
# 22 Britannia Gardens - needs confirmation
# 7 Satanita Road - needs confirmation
# 12 Cheltenham Crescent
"100011402969": {
"variable": "Wall type",
"newvalue": "Cavity",
},
"100021031752": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
# 79 Craven Park Road
"100021169682": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
# 88 Darenth Road
"100021036148": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021036165": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021036167": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021053849": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021054353": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021054560": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021059839": {
"variable": "Roof type",
"newvalue": "Room Roof"
},
"100021059848": {
"variable": "Roof type",
"newvalue": "Room Roof"
}
}
# Perform the remaps
for uprn, config in value_remaps.items():
archetyping_data[config["variable"]] = np.where(
archetyping_data["uprn"].astype(str) == uprn, config["newvalue"], archetyping_data[config["variable"]]
)
# row_id = data[
# # (data["Address letter or number"] == "C") &
# (data["Street address"].str.strip() == "41 Moresby Road")
# ]["row_id"]
# if len(row_id) != 1:
# raise Exception("Fail")
# print(epc_data[epc_data["row_id"] == row_id.values[0]]["uprn"])
# Map the year to the age band
def categorize_year(year):
    """Map a build year onto an age-band letter from 'A' to 'L'.

    Args:
        year: The build year as a number (e.g. 1930 or 1930.0), a numeric
            string ('1930') or a decade string ('1930s').

    Returns:
        'A' for years before 1900, then one letter per band up to 'L' for
        2012 onwards.

    Raises:
        ValueError, TypeError: Propagated from int() when the value cannot
            be converted to a whole year (e.g. NaN, None or free text).
    """
    if isinstance(year, str):
        # Strip first: the decade form is read from the first four characters,
        # so leading whitespace (" 1930s") would otherwise be sliced to " 193"
        # and silently land in band 'A'.
        year = year.strip()
        # Handle the case where year is in the format '1930s'
        if 's' in year:
            year = year[:4]
    year = int(year)

    # (last year covered by the band, band letter), in ascending order.
    band_upper_bounds = (
        (1899, 'A'),
        (1929, 'B'),
        (1949, 'C'),
        (1966, 'D'),
        (1975, 'E'),
        (1982, 'F'),
        (1990, 'G'),
        (1995, 'H'),
        (2002, 'I'),
        (2006, 'J'),
        (2011, 'K'),
    )
    for last_year, band in band_upper_bounds:
        if year <= last_year:
            return band
    # year >= 2012
    return 'L'
archetyping_data["SAP_age_band"] = archetyping_data["Property year built"].apply(
categorize_year
)
# Flag if the property is in London/Manchester
archetyping_data["Location"] = np.where(
archetyping_data["local-authority-label"].isin(
["Hackney", "Barnet", "Haringey"]
),
"London",
np.where(
archetyping_data["local-authority-label"].isin(
["Salford", "Bury"]
),
"Manchester",
"Southend"
)
)
# 9 Greenview is in Manchester
archetyping_data["Location"] = np.where(
archetyping_data["row_id"] == data[data["Street address"] == "9 Greenview"]["row_id"].values[0],
"Manchester",
archetyping_data["Location"]
)
# Hackney 73 - London
# Southend-on-Sea 6 - Southend
# Barnet 4 - London
# Castle Point 4 - Southend
# Haringey 3 - London
# Salford 2 - Manchester
# Bury 1 - Manchester
primary_archetyping_cols = [
'Property type',
"Location (Floor)",
'Current heating system type',
'Wall type',
'Roof type',
"Location",
# 'current-energy-rating', 'property-type-reduced', 'built-form-reduced', 'is_cavity_wall',
# 'is_solid_brick_wall', 'is_system_built_wall', 'is_timber_frame_wall', 'is_as_built',
# 'is_solid', 'is_roof_room',
# 'is_loft', 'is_flat', 'is_thatched',
# 'is_at_rafters', 'has_dwelling_above',
# 'conservation_status',
]
# Secondary columns kept alongside the primary archetyping columns.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into one (non-existent) column name.
secondary_cols = [
    'SAP_age_band',
    'is_filled_cavity',
    'insulation_thickness_wall',
    'insulation_thickness_floor',
    'insulation_thickness',
    'is_assumed_wall',
    'is_assumed_roof',
    'Floor_area_category',
]
# One row per distinct combination of the primary archetyping columns.
archetypes = archetyping_data[primary_archetyping_cols].drop_duplicates()
# Hash the variables.
# The built-in hash() is salted per interpreter process for strings, so it would give
# a different archetype_hash (and therefore a different archetype_id ordering) on every
# run. hash_pandas_object is deterministic, which keeps the ids stable between runs.
archetypes["archetype_hash"] = pd.util.hash_pandas_object(archetypes, index=False)
# Order by hash so that the id depends only on the archetype's values, not on row order.
archetypes = archetypes.sort_values("archetype_hash", ascending=True)
archetypes = archetypes.reset_index(drop=True)
archetypes["archetype_id"] = archetypes.index
archetypes.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/basic-archetypes.csv", index=False)
# We match properties to archetypes
archetyping_data = archetyping_data.merge(
archetypes,
on=primary_archetyping_cols,
how="left"
)
# We should choose a representative property for each archetype
archetyping_data = archetyping_data.merge(
epc_metadata[["row_id", "days_since_last_epc"]],
how="left",
on="row_id"
)
# Mark the property with the oldest EPC as the representative property
representative_properties = archetyping_data.sort_values(
["archetype_id", "days_since_last_epc"], ascending=[True, False]
).drop_duplicates("archetype_id")
archetyping_data["for_sample"] = np.where(
archetyping_data["row_id"].isin(representative_properties["row_id"]),
True,
False
)
# We save the archetyping data
archetyping_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/archetyping_data.csv",
index=False)
# Save the EPC data
epc_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/epc_data.csv", index=False)
# Save the spatial data
spatial_data = data[["row_id", "Address letter or number", "Street address", "Postcode"]].merge(
spatial_data,
on="row_id",
how="left"
)
spatial_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/spatial_data.csv", index=False)
# Save archetyping data
archetyping_data = data[["row_id", "Address letter or number", "Street address", "Postcode"]].merge(
archetyping_data,
on="row_id",
how="left"
)
archetyping_data = archetyping_data.drop(columns=["row_id"])