Adding to archetyping

2026-07-27 23:35:01 +00:00 · 2024-09-13 18:05:02 +01:00 · 2024-09-13 18:05:02 +01:00 · 391c6f5cf0
commit 391c6f5cf0
parent 15f55c021f
1 changed files with 408 additions and 0 deletions
--- a/etl/customers/aiha/epc_data_pull.py
+++ b/etl/customers/aiha/epc_data_pull.py
@ -2,6 +2,9 @@ import os
 from tqdm import tqdm
 from dotenv import load_dotenv
 import pandas as pd
+import numpy as np
+import msgpack
+from utils.s3 import read_from_s3
 from backend.SearchEpc import SearchEpc
 from etl.spatial.OpenUprnClient import OpenUprnClient

@ -345,7 +348,63 @@ def app():
    # All properties match up apart from one where the asset data indicates it's in a conservation area, however
    # the spatial data indicates it's not. There do not appear to be any listed/heritage buildings in the portfolio

+    ################################################################
    # Draft archetyping
+    ################################################################
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    epc_data = epc_data.merge(
+        pd.DataFrame(cleaned["walls-description"])[
+            ['original_description',
+             'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
+             'is_as_built', 'is_assumed', 'insulation_thickness']
+
+        ].rename(
+            columns={
+                "is_solid_brick": "is_solid_brick_wall",
+                "is_system_built": "is_system_built_wall",
+                "is_timber_frame": "is_timber_frame_wall",
+                "is_assumed": "is_assumed_wall",
+                "insulation_thickness": "insulation_thickness_wall"
+            }
+        ),
+        left_on="walls-description",
+        right_on="original_description"
+    ).merge(
+        pd.DataFrame(cleaned["roof-description"])[
+            [
+                'original_description', 'is_pitched', 'is_roof_room', 'is_loft',
+                'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
+                'has_dwelling_above', 'insulation_thickness'
+            ]
+        ].rename(
+            columns={
+                "is_assumed": "is_assumed_roof",
+            }
+        ),
+        left_on="roof-description",
+        right_on="original_description"
+    ).merge(
+        pd.DataFrame(cleaned["floor-description"])[
+            [
+                'original_description', 'is_solid', 'is_suspended', 'is_assumed',
+                'insulation_thickness'
+            ]
+        ].rename(
+            columns={
+                "is_assumed": "is_assumed_floor",
+                "insulation_thickness": "insulation_thickness_floor"
+            }
+        ),
+        left_on="floor-description",
+        right_on="original_description"
+    )
+
    archetyping_data = data[
        [
            "row_id",
@ -360,4 +419,353 @@ def app():
            "Window type",
            "Location (Floor)",
        ]
+    ].merge(
+        epc_metadata[["row_id", "floor"]],
+        how="left",
+        on="row_id"
+    ).merge(
+        epc_data[
+            [
+                "row_id", "uprn", "current-energy-rating", "property-type", "built-form", "total-floor-area",
+                'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick_wall', 'is_system_built_wall',
+                'is_timber_frame_wall', 'is_as_built', 'is_assumed_wall', 'insulation_thickness_wall',
+                'is_solid', 'is_suspended', 'is_assumed_floor', 'insulation_thickness_floor',
+                'is_pitched', 'is_roof_room', 'is_loft',
+                'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed_roof',
+                'has_dwelling_above', 'insulation_thickness', "mainheat-description",
+                "local-authority-label"
+            ]
+        ],
+        how="left",
+        on="row_id"
+    ).merge(
+        spatial_data[["row_id", "conservation_status", ]],
+        on="row_id",
+        how="left"
+    )
+
+    if archetyping_data.shape[0] != data.shape[0]:
+        raise Exception("Mismatch in data")
+
+    # We create groups analogous to the Energy Company Obligation
+    # 0 - 72, 73 - 97, 98 - 199, 200+
+    archetyping_data["Floor_area_category"] = pd.cut(
+        archetyping_data["Gross internal area (sqm)"],
+        bins=[0, 72, 97, 199, 1000],
+        labels=["0-72", "73-97", "98-199", "200+"]
+    )
+    archetyping_data["Floor_area_category_backup"] = pd.cut(
+        archetyping_data["total-floor-area"].astype(float),
+        bins=[0, 72, 97, 199, 1000],
+        labels=["0-72", "73-97", "98-199", "200+"]
+    )
+    archetyping_data["Floor_area_category"] = archetyping_data["Floor_area_category"].fillna(
+        archetyping_data["Floor_area_category_backup"]
+    )
+    # Label properties that have no usable floor area (neither the asset value nor the
+    # EPC backup fell inside a bin) as "Unknown". The null check has to run BEFORE the
+    # string cast: casting first renders missing categories as the text "nan", after
+    # which pd.isnull() never matches and "Unknown" is never assigned.
+    floor_area_missing = pd.isnull(archetyping_data["Floor_area_category"])
+    archetyping_data["Floor_area_category"] = np.where(
+        floor_area_missing,
+        "Unknown",
+        archetyping_data["Floor_area_category"].astype(str)
+    )
+    archetyping_data = archetyping_data.drop(columns=["Floor_area_category_backup"])
+
+    # Collapse flats and maisonettes into one property type; every other property
+    # type is carried over unchanged.
+    archetyping_data["property-type-reduced"] = np.where(
+        archetyping_data["property-type"].isin(["Flat", "Maisonette"]),
+        "Flat/Maisonette",
+        archetyping_data["property-type"]
+    )
+
+    archetyping_data["built-form-reduced"] = np.where(
+        archetyping_data["built-form"].isin(["End-Terrace", "Semi-Detached"]),
+        "End-Terrace/Semi-Detached",
+        archetyping_data["built-form"]
+    )
+    archetyping_data["built-form-reduced"] = np.where(
+        archetyping_data["property-type-reduced"] == "Flat/Maisonette",
+        "Flat/Maisonette",
+        archetyping_data["built-form-reduced"]
+    )
+
+    archetyping_data["Wall type"] = np.where(
+        archetyping_data["Wall type"].isin(['Solid ', 'Solid - internal lining ']),
+        "Solid",
+        archetyping_data["Wall type"]
+    )
+    archetyping_data["Wall type"] = np.where(
+        archetyping_data["Wall type"].isin(['Cavity ', 'cavity ']),
+        "Cavity",
+        archetyping_data["Wall type"]
+    )
+
+    # Proposed remaps based on discoveries
+    value_remaps = {
+        # 8 Filey Avenue
+        "100021040744": {
+            "variable": "Property type",
+            "newvalue": "House, mid-terrace",
+        },
+        # 7	Yetev Lev Court
+        "100021032043": {
+            "variable": "Wall type",
+            "newvalue": "Cavity",
+        },
+        # 14 Yetev Lev Court
+        "100021032050": {
+            "variable": "Wall type",
+            "newvalue": "Cavity",
+        },
+        # 23 Yetev Lev Court
+        "100021032059": {
+            "variable": "Wall type",
+            "newvalue": "Cavity",
+        },
+        # 30 Yetev Lev Court
+        "100021032066": {
+            "variable": "Wall type",
+            "newvalue": "Cavity",
+        },
+        # 34 Yetev Lev Court
+        "100021032070": {
+            "variable": "Wall type",
+            "newvalue": "Cavity",
+        },
+        # B	86 Bethune Road
+        "100021026285": {
+            "variable": "Wall type",
+            "newvalue": "Solid",
+        },
+        # A	80 Bethune Road
+        "100021026277": {
+            "variable": "Wall type",
+            "newvalue": "Solid",
+        },
+        # 140 Kyverdale Road
+        "100021052262": {
+            "variable": "Property type",
+            "newvalue": "House, mid-terrace",
+        },
+        # 6 Leabourne Road
+        "100021053799": {
+            "variable": "Wall type",
+            "newvalue": "Solid",
+        },
+        # 22 Britannia Gardens - needs confirmation
+        # 7 Satanita Road - needs confirmation
+        # 12 Cheltenham Crescent
+        "100011402969": {
+            "variable": "Wall type",
+            "newvalue": "Cavity",
+        },
+        "100021031752": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        # 79 Craven Park Road
+        "100021169682": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        # 88 Darenth Road
+        "100021036148": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021036165": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021036167": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021053849": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021054353": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021054560": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021059839": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        },
+        "100021059848": {
+            "variable": "Roof type",
+            "newvalue": "Room Roof"
+        }
+    }
+
+    # Perform the remaps
+    for uprn, config in value_remaps.items():
+        archetyping_data[config["variable"]] = np.where(
+            archetyping_data["uprn"].astype(str) == uprn, config["newvalue"], archetyping_data[config["variable"]]
+        )
+
+    # row_id = data[
+    #     # (data["Address letter or number"] == "C") &
+    #     (data["Street address"].str.strip() == "41 Moresby Road")
+    # ]["row_id"]
+    # if len(row_id) != 1:
+    #     raise Exception("Fail")
+    # print(epc_data[epc_data["row_id"] == row_id.values[0]]["uprn"])
+
+    # Map the year to the age band
+    def categorize_year(year):
+        """Return the age-band letter ('A' to 'L') for a construction year.
+
+        `year` may be anything int() accepts, or a decade string such as
+        '1930s', which is read as its first four characters.
+        """
+        # Decade strings like '1930s' carry the year in their first four characters
+        if isinstance(year, str) and 's' in year:
+            year = year[:4]
+        year = int(year)
+
+        # (last year covered by the band, band letter), in ascending order
+        band_limits = (
+            (1899, 'A'),
+            (1929, 'B'),
+            (1949, 'C'),
+            (1966, 'D'),
+            (1975, 'E'),
+            (1982, 'F'),
+            (1990, 'G'),
+            (1995, 'H'),
+            (2002, 'I'),
+            (2006, 'J'),
+            (2011, 'K'),
+        )
+        for last_year, band in band_limits:
+            if year <= last_year:
+                return band
+
+        # 2012 onwards
+        return 'L'
+
+    archetyping_data["SAP_age_band"] = archetyping_data["Property year built"].apply(
+        categorize_year
+    )
+
+    # Flag whether the property is in London or Manchester; any other local authority is treated as Southend
+    archetyping_data["Location"] = np.where(
+        archetyping_data["local-authority-label"].isin(
+            ["Hackney", "Barnet", "Haringey"]
+        ),
+        "London",
+        np.where(
+            archetyping_data["local-authority-label"].isin(
+                ["Salford", "Bury"]
+            ),
+            "Manchester",
+            "Southend"
+        )
+    )
+    # 9 Greenview is in Manchester, so override whatever the local authority lookup assigned
+    archetyping_data["Location"] = np.where(
+        archetyping_data["row_id"] == data[data["Street address"] == "9 Greenview"]["row_id"].values[0],
+        "Manchester",
+        archetyping_data["Location"]
+    )
+
+    # Hackney            73 - London
+    # Southend-on-Sea     6 - Southend
+    # Barnet              4 - London
+    # Castle Point        4 - Southend
+    # Haringey            3 - London
+    # Salford             2 - Manchester
+    # Bury                1 - Manchester
+
+    primary_archetyping_cols = [
+        'Property type',
+        "Location (Floor)",
+        'Current heating system type',
+        'Wall type',
+        'Roof type',
+        "Location",
+        # 'current-energy-rating', 'property-type-reduced', 'built-form-reduced', 'is_cavity_wall',
+        # 'is_solid_brick_wall', 'is_system_built_wall', 'is_timber_frame_wall', 'is_as_built',
+        # 'is_solid', 'is_roof_room',
+        # 'is_loft', 'is_flat', 'is_thatched',
+        # 'is_at_rafters', 'has_dwelling_above',
+        # 'conservation_status',
    ]
+
+    # Secondary attributes, listed separately from the primary archetyping columns.
+    # Every entry needs its own trailing comma: adjacent string literals without one
+    # are silently concatenated into a single, non-existent column name.
+    secondary_cols = [
+        'SAP_age_band',
+        'is_filled_cavity',
+        'insulation_thickness_wall',
+        'insulation_thickness_floor',
+        'insulation_thickness',
+        'is_assumed_wall',
+        'is_assumed_roof',
+        'Floor_area_category',
+    ]
+
+    # One row per distinct combination of the primary archetyping columns
+    archetypes = archetyping_data[primary_archetyping_cols].drop_duplicates()
+    # Hash the variables. Python's built-in hash() is salted per interpreter process
+    # for strings, so it would give different hashes (and therefore a different sort
+    # order and different archetype ids) on every run. pandas' row hash is
+    # deterministic, which keeps the ids in the exported CSVs reproducible.
+    archetypes["archetype_hash"] = pd.util.hash_pandas_object(
+        archetypes[primary_archetyping_cols],
+        index=False
+    )
+    archetypes = archetypes.sort_values("archetype_hash", ascending=True)
+    archetypes = archetypes.reset_index(drop=True)
+    # The id is simply the position in the hash-sorted table
+    archetypes["archetype_id"] = archetypes.index
+
+    archetypes.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/basic-archetypes.csv", index=False)
+
+    # We match properties to archetypes
+    archetyping_data = archetyping_data.merge(
+        archetypes,
+        on=primary_archetyping_cols,
+        how="left"
+    )
+
+    # We should choose a representative property for each archetype
+    archetyping_data = archetyping_data.merge(
+        epc_metadata[["row_id", "days_since_last_epc"]],
+        how="left",
+        on="row_id"
+    )
+
+    # Mark the property with the oldest EPC as the representative property
+    representative_properties = archetyping_data.sort_values(
+        ["archetype_id", "days_since_last_epc"], ascending=[True, False]
+    ).drop_duplicates("archetype_id")
+
+    archetyping_data["for_sample"] = np.where(
+        archetyping_data["row_id"].isin(representative_properties["row_id"]),
+        True,
+        False
+    )
+
+    # We save the archetyping data
+    archetyping_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/archetyping_data.csv",
+                            index=False)
+    # Save the EPC data
+    epc_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/epc_data.csv", index=False)
+    # Save the spatial data
+    spatial_data = data[["row_id", "Address letter or number", "Street address", "Postcode"]].merge(
+        spatial_data,
+        on="row_id",
+        how="left"
+    )
+    spatial_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/spatial_data.csv", index=False)
+
+    # Save archetyping data
+    archetyping_data = data[["row_id", "Address letter or number", "Street address", "Postcode"]].merge(
+        archetyping_data,
+        on="row_id",
+        how="left"
+    )
+
+    archetyping_data = archetyping_data.drop(columns=["row_id"])