From 391c6f5cf0e07eca36e8d2ecf8c075e475df95b5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 13 Sep 2024 18:05:02 +0100 Subject: [PATCH] Adding to archetyping --- etl/customers/aiha/epc_data_pull.py | 408 ++++++++++++++++++++++++++++ 1 file changed, 408 insertions(+) diff --git a/etl/customers/aiha/epc_data_pull.py b/etl/customers/aiha/epc_data_pull.py index 8aaaf5ba..5e7c6714 100644 --- a/etl/customers/aiha/epc_data_pull.py +++ b/etl/customers/aiha/epc_data_pull.py @@ -2,6 +2,9 @@ import os from tqdm import tqdm from dotenv import load_dotenv import pandas as pd +import numpy as np +import msgpack +from utils.s3 import read_from_s3 from backend.SearchEpc import SearchEpc from etl.spatial.OpenUprnClient import OpenUprnClient @@ -345,7 +348,63 @@ def app(): # All properties match up apart from one where the asset data indicates it's in a conservation area, however # the sparital data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio + ################################################################ # Draft archetyping + ################################################################ + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + epc_data = epc_data.merge( + pd.DataFrame(cleaned["walls-description"])[ + ['original_description', + 'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame', + 'is_as_built', 'is_assumed', 'insulation_thickness'] + + ].rename( + columns={ + "is_solid_brick": "is_solid_brick_wall", + "is_system_built": "is_system_built_wall", + "is_timber_frame": "is_timber_frame_wall", + "is_assumed": "is_assumed_wall", + "insulation_thickness": "insulation_thickness_wall" + } + ), + left_on="walls-description", + right_on="original_description" + ).merge( + pd.DataFrame(cleaned["roof-description"])[ + [ + 'original_description', 'is_pitched', 
'is_roof_room', 'is_loft', + 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', + 'has_dwelling_above', 'insulation_thickness' + ] + ].rename( + columns={ + "is_assumed": "is_assumed_roof", + } + ), + left_on="roof-description", + right_on="original_description" + ).merge( + pd.DataFrame(cleaned["floor-description"])[ + [ + 'original_description', 'is_solid', 'is_suspended', 'is_assumed', + 'insulation_thickness' + ] + ].rename( + columns={ + "is_assumed": "is_assumed_floor", + "insulation_thickness": "insulation_thickness_floor" + } + ), + left_on="floor-description", + right_on="original_description" + ) + archetyping_data = data[ [ "row_id", @@ -360,4 +419,353 @@ def app(): "Window type", "Location (Floor)", ] + ].merge( + epc_metadata[["row_id", "floor"]], + how="left", + on="row_id" + ).merge( + epc_data[ + [ + "row_id", "uprn", "current-energy-rating", "property-type", "built-form", "total-floor-area", + 'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick_wall', 'is_system_built_wall', + 'is_timber_frame_wall', 'is_as_built', 'is_assumed_wall', 'insulation_thickness_wall', + 'is_solid', 'is_suspended', 'is_assumed_floor', 'insulation_thickness_floor', + 'is_pitched', 'is_roof_room', 'is_loft', + 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed_roof', + 'has_dwelling_above', 'insulation_thickness', "mainheat-description", + "local-authority-label" + ] + ], + how="left", + on="row_id" + ).merge( + spatial_data[["row_id", "conservation_status", ]], + on="row_id", + how="left" + ) + + if archetyping_data.shape[0] != data.shape[0]: + raise Exception("Mismatch in data") + + # We create groups analogous to the Energy Company Obligation + # 0 - 72, 73 - 97, 98 - 199, 200+ + archetyping_data["Floor_area_category"] = pd.cut( + archetyping_data["Gross internal area (sqm)"], + bins=[0, 72, 97, 199, 1000], + labels=["0-72", "73-97", "98-199", "200+"] + ) + archetyping_data["Floor_area_category_backup"] = pd.cut( + 
archetyping_data["total-floor-area"].astype(float), + bins=[0, 72, 97, 199, 1000], + labels=["0-72", "73-97", "98-199", "200+"] + ) + archetyping_data["Floor_area_category"] = archetyping_data["Floor_area_category"].fillna( + archetyping_data["Floor_area_category_backup"] + ) + archetyping_data["Floor_area_category"] = archetyping_data["Floor_area_category"].astype(str) + archetyping_data["Floor_area_category"] = np.where( + pd.isnull(archetyping_data["Floor_area_category"]), + "Unknown", + archetyping_data["Floor_area_category"] + ) + archetyping_data = archetyping_data.drop(columns=["Floor_area_category_backup"]) + + archetyping_data["property-type-reduced"] = np.where( + archetyping_data["property-type"].isin(["Flat", "Maisionette"]), + "Flat/Maisonette", + archetyping_data["property-type"] + ) + + archetyping_data["built-form-reduced"] = np.where( + archetyping_data["built-form"].isin(["End-Terrace", "Semi-Detached"]), + "End-Terrace/Semi-Detached", + archetyping_data["built-form"] + ) + archetyping_data["built-form-reduced"] = np.where( + archetyping_data["property-type-reduced"] == "Flat/Maisonette", + "Flat/Maisonette", + archetyping_data["built-form-reduced"] + ) + + archetyping_data["Wall type"] = np.where( + archetyping_data["Wall type"].isin(['Solid ', 'Solid - internal lining ']), + "Solid", + archetyping_data["Wall type"] + ) + archetyping_data["Wall type"] = np.where( + archetyping_data["Wall type"].isin(['Cavity ', 'cavity ']), + "Cavity", + archetyping_data["Wall type"] + ) + + # Proposed remaps based on discoveries + value_remaps = { + # 8 Filey Avenue + "100021040744": { + "variable": "Property type", + "newvalue": "House, mid-terrace", + }, + # 7 Yetev Lev Court + "100021032043": { + "variable": "Wall type", + "newvalue": "Cavity", + }, + # 14 Yetev Lev Court + "100021032050": { + "variable": "Wall type", + "newvalue": "Cavity", + }, + # 23 Yetev Lev Court + "100021032059": { + "variable": "Wall type", + "newvalue": "Cavity", + }, + # 30 Yetev 
Lev Court + "100021032066": { + "variable": "Wall type", + "newvalue": "Cavity", + }, + # 34 Yetev Lev Court + "100021032070": { + "variable": "Wall type", + "newvalue": "Cavity", + }, + # B 86 Bethune Road + "100021026285": { + "variable": "Wall type", + "newvalue": "Solid", + }, + # A 80 Bethune Road + "100021026277": { + "variable": "Wall type", + "newvalue": "Solid", + }, + # 140 Kyverdale Road + "100021052262": { + "variable": "Property type", + "newvalue": "House, mid-terrace", + }, + # 6 Leabourne Road + "100021053799": { + "variable": "Wall type", + "newvalue": "Solid", + }, + # 22 Britannia Gardens - needs confirmation + # 7 Satanita Road - needs confirmation + # 12 Cheltenham Crescent + "100011402969": { + "variable": "Wall type", + "newvalue": "Cavity", + }, + "100021031752": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + # 79 Craven Park Road + "100021169682": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + # 88 Darenth Road + "100021036148": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021036165": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021036167": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021053849": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021054353": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021054560": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021059839": { + "variable": "Roof type", + "newvalue": "Room Roof" + }, + "100021059848": { + "variable": "Roof type", + "newvalue": "Room Roof" + } + } + + # Perform the remaps + for uprn, config in value_remaps.items(): + archetyping_data[config["variable"]] = np.where( + archetyping_data["uprn"].astype(str) == uprn, config["newvalue"], archetyping_data[config["variable"]] + ) + + # row_id = data[ + # # (data["Address letter or number"] == "C") & + # (data["Street address"].str.strip() == "41 Moresby Road") + # ]["row_id"] + # if 
len(row_id) != 1: + # raise Exception("Fail") + # print(epc_data[epc_data["row_id"] == row_id.values[0]]["uprn"]) + + # Map the year to the age band + def categorize_year(year): + if isinstance(year, str): + # Handle the case where year is in the format '1930s' + if 's' in year: + year = int(year[:4]) + else: + year = int(year) + else: + year = int(year) + + # Categorize based on year ranges + if year < 1900: + return 'A' + elif 1900 <= year <= 1929: + return 'B' + elif 1930 <= year <= 1949: + return 'C' + elif 1950 <= year <= 1966: + return 'D' + elif 1967 <= year <= 1975: + return 'E' + elif 1976 <= year <= 1982: + return 'F' + elif 1983 <= year <= 1990: + return 'G' + elif 1991 <= year <= 1995: + return 'H' + elif 1996 <= year <= 2002: + return 'I' + elif 2003 <= year <= 2006: + return 'J' + elif 2007 <= year <= 2011: + return 'K' + else: # year >= 2012 + return 'L' + + archetyping_data["SAP_age_band"] = archetyping_data["Property year built"].apply( + categorize_year + ) + + # Flag whether the property is in London, Manchester or (by default) Southend + archetyping_data["Location"] = np.where( + archetyping_data["local-authority-label"].isin( + ["Hackney", "Barnet", "Haringey"] + ), + "London", + np.where( + archetyping_data["local-authority-label"].isin( + ["Salford", "Bury"] + ), + "Manchester", + "Southend" + ) + ) + # 9 Greenview is in Manchester + archetyping_data["Location"] = np.where( + archetyping_data["row_id"] == data[data["Street address"] == "9 Greenview"]["row_id"].values[0], + "Manchester", + archetyping_data["Location"] + ) + + # Hackney 73 - London + # Southend-on-Sea 6 - Southend + # Barnet 4 - London + # Castle Point 4 - Southend + # Haringey 3 - London + # Salford 2 - Manchester + # Bury 1 - Manchester + + primary_archetyping_cols = [ + 'Property type', + "Location (Floor)", + 'Current heating system type', + 'Wall type', + 'Roof type', + "Location", + # 'current-energy-rating', 'property-type-reduced', 'built-form-reduced', 'is_cavity_wall', + # 'is_solid_brick_wall', 
'is_system_built_wall', 'is_timber_frame_wall', 'is_as_built', + # 'is_solid', 'is_roof_room', + # 'is_loft', 'is_flat', 'is_thatched', + # 'is_at_rafters', 'has_dwelling_above', + # 'conservation_status', ] + + secondary_cols = [ + 'SAP_age_band', + 'is_filled_cavity', + 'insulation_thickness_wall' + 'insulation_thickness_floor' + 'insulation_thickness', + 'is_assumed_wall', + 'is_assumed_roof', + 'Floor_area_category' + ] + + archetypes = archetyping_data[primary_archetyping_cols].drop_duplicates() + # Hash the variables + archetypes["archetype_hash"] = archetypes.apply( + lambda x: hash(tuple(x.values)), + axis=1 + ) + archetypes = archetypes.sort_values("archetype_hash", ascending=True) + archetypes = archetypes.reset_index(drop=True) + archetypes["archetype_id"] = archetypes.index + + archetypes.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/basic-archetypes.csv", index=False) + + # We match properties to archetypes + archetyping_data = archetyping_data.merge( + archetypes, + on=primary_archetyping_cols, + how="left" + ) + + # We should choose a representative property for each archetype + archetyping_data = archetyping_data.merge( + epc_metadata[["row_id", "days_since_last_epc"]], + how="left", + on="row_id" + ) + + # Mark the property with the oldest EPC as the representative property + representative_properties = archetyping_data.sort_values( + ["archetype_id", "days_since_last_epc"], ascending=[True, False] + ).drop_duplicates("archetype_id") + + archetyping_data["for_sample"] = np.where( + archetyping_data["row_id"].isin(representative_properties["row_id"]), + True, + False + ) + + # We save the archetyping data + archetyping_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/archetyping_data.csv", + index=False) + # Save the EPC data + epc_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/epc_data.csv", index=False) + # Save the spatial data + spatial_data = data[["row_id", "Address letter 
or number", "Street address", "Postcode"]].merge( + spatial_data, + on="row_id", + how="left" + ) + spatial_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/spatial_data.csv", index=False) + + # Save archetyping data + archetyping_data = data[["row_id", "Address letter or number", "Street address", "Postcode"]].merge( + archetyping_data, + on="row_id", + how="left" + ) + + archetyping_data = archetyping_data.drop(columns=["row_id"])