import os
import re
import openpyxl
from fuzzywuzzy import fuzz
from pathlib import Path
import msgpack
from datetime import datetime
import pandas as pd
import numpy as np
from utils.s3 import (
    read_from_s3,
    read_dataframe_from_s3_parquet,
    save_pickle_to_s3,
    read_pickle_from_s3,
    save_dataframe_to_s3_parquet
)
from utils.logger import setup_logger
from dotenv import load_dotenv
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
from etl.eligibility.ha_15_32.app import prepare_model_data_row
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from etl.epc.Record import EPCRecord
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc.DataProcessor import EPCDataProcessor
from datetime import datetime  # duplicate of the import above; kept as in the original
import inspect

# Path of this source file, resolved via a throwaway lambda defined here.
src_file_path = inspect.getfile(lambda: None)

# NOTE(review): this is read *before* load_dotenv(ENV_FILE) runs below, so a token that
# only lives in the .env file will not be picked up here - confirm that is intended.
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
ENV_FILE = Path(src_file_path).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
DATA_FOLDER = Path(src_file_path).parent / "local_data" / "ha_data"

logger = setup_logger()

load_dotenv(ENV_FILE)

# Per-housing-association lookup from the raw property descriptions found in each asset
# list to normalised values. The shape is not uniform across associations:
#   * flat mapping                      raw value -> property type (or None)
#   * {"property_type": {...}} and/or {"built_form": {...}} sub-mappings
#   * raw value -> {"property_type": ..., "built_form": ...} (HA39)
#   * raw value -> {"property-type": ..., "built-form": ...} (HA16, hyphenated keys)
# A value of None marks a raw description with no normalised equivalent.
PROPERTY_TYPE_LOOKUP = {
    "HA1": {
        "built_form": {
            "Mid Terrace": "Mid-Terrace",
            "Semi-Detached": "Semi-Detached",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "Enclosed Mid": "Mid-Terrace",
            "Detached Local Connect": "Detached",
        }
    },
    "HA2": {
        "HOUSE": "House",
        "FLAT": "Flat",
        "SHELTERED": None,
        "BUNGALOW": "Bungalow",
        "BED-SIT": None,
        "MAISONETTE": "Maisonette",
        "HOSTEL": None
    },
    "HA5": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Bedsit": None
    },
    "HA6": {
        "property_type": {
            "HOUSE": "House",
            "GROUND FLOOR FLAT": "Flat",
            "UPPER FLOOR FLAT": "Flat",
            "MAISONETTE": "Maisonette",
            "BUNGALOW": "Bungalow",
            "WARDEN BUNGALOW": "Bungalow",
            "WARDEN FLAT": "Flat",
            "EXTRACARE SCHEME": "Flat",
        }
    },
    "HA7": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "End Terraced": "End-Terrace",
        }
    },
    "HA8": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Room": None,
        "Other": None,
        "Commerical": None
    },
    "HA11": {
        "Flat": "Flat",
        "House": "House",
        "Semi-Det House": "House",
        "Bedsit": None,
        "End-Terr House": "House",
        "Mid-Terr House": "House",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "End Terr Flat": "Flat",
        "Mid Terr Flat": "Flat",
        "Detached Flat": "Flat",
    },
    "HA12": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
    },
    "HA13": {
        "House": "House",
        "Flat": "Flat",
        "House MT": "House",
        "House SD": "House",
        "House ET": "House",
        "Bungalow MT": "Bungalow",
        "Bungalow ET": "Bungalow",
        "ii": None,
    },
    "HA14": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        }
    },
    "HA15": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Flat over garage": "Flat",
    },
    # NOTE(review): HA16 uses hyphenated inner keys ("property-type"/"built-form") while
    # HA39 uses underscores - left untouched because consumers are outside this view.
    "HA16": {
        "Semi Detached Bungalow": {"property-type": "Bungalow", "built-form": "Semi-Detached"},
        "Mid Terraced House": {"property-type": "House", "built-form": "Mid-Terrace"},
        "End Terraced House": {"property-type": "House", "built-form": "End-Terrace"},
        "Low Rise Flat": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Semi-Detached House": {"property-type": "House", "built-form": "Semi-Detached"},
        "Detached Bungalow": {"property-type": "Bungalow", "built-form": "Detached"},
        "End Terraced Bungalow": {"property-type": "Bungalow", "built-form": "End-Terrace"},
        "Mid Terraced Bungalow": {"property-type": "Bungalow", "built-form": "Mid-Terrace"},
        "Medium Rise Flat": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Detached House": {"property-type": "House", "built-form": "Detached"},
        "Cottage Flat": {"property-type": "Flat", "built-form": "Semi-Detached"},
        "Maisonette Medium Rise": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Maisonette Over Shop": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "End Terraced Town House": {"property-type": "House", "built-form": "End-Terrace"},
        "Flat Over Shop": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Mid Terraced Town House": {"property-type": "House", "built-form": "Mid-Terrace"},
    },
    "HA18": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Shop": None,
        "Hostel": None,
        "Block": None,
    },
    "HA20": {
        "House": "House",
        "Flat": "Flat",
        "Sheltered Flat": "Flat",
        "Maisonette": "Maisonette",
        "Bungalow": "Bungalow",
        "House. SD": "House",
        "House. MT": "House",
        "House. ET": "House",
        "Sheltered Bungalow": "Bungalow",
        "Guest Accomodation": None,
        "Sheltered House": "House",
        "House. MT ": "House",
        "House. D": "House"
    },
    "HA24": {
        "01 HOUSE": "House",
        "02 FLAT": "Flat",
        "03 BUNGALOW": "Bungalow",
        "10 PBUNGALOW": "Bungalow",
        "01 HOUSE MID": "House",
        "13 SBUNGALOW": "Bungalow",
        "12 SBEDSIT": None,  # BEDSIT does not match the specified property types
        "14 SFLAT": "Flat",
        "05 BEDSIT": None,
        "04 MAISONETTE": "Maisonette",
        "11 PFLAT": "Flat",
        "09 PBEDSIT": None
    },
    "HA25": {
        "Flat": "Flat",
        "Mid Terrace House": "House",
        "Semi Detached House": "House",
        "End Terrace House": "House",
        "House": "House",
        "Semi Detached Bung": "Bungalow",
        "Bungalow": "Bungalow",
        "End Terrace Bungalow": "Bungalow",
        "Maisonnette": "Maisonette",
        "Mid Terrace Bungalow": "Bungalow",
        "Bedspace": None,
        "Detached House": "House",
        "Bedsit": "Flat",
        "Coach House": "House",
        "Detached Bungalow": "Bungalow",
        "Office Buildings": None,
        "Guest Room": None,
        "Mid Terrace Housekeeping ": "House",
        "End Terrace Housex": "House"
    },
    "HA28": {
        "Flat": "Flat",
        "Semi detached house": "House",
        "Terraced house": "House",
        "Maisonette flat": "Maisonette",
        "Sheltered bedsit": None,
        "APD flat": "Flat",
        "Bungalow terraced": "Bungalow",
        "Flat with partition": "Flat",
        "Bungalow semi detached": "Bungalow",
        "APD Bungalow": "Bungalow",
        "Sheltered flat": "Flat",
        "Bedsit Flat": "Flat",
        "Bedsit bungalow semi detached": "Bungalow",
        "Sheltered bungalow terraced": "Bungalow",
        "Sheltered bedsit disabled": None,
        "Bedsit bungalow terraced": "Bungalow",
        "Sheltered bungalow semi detached": "Bungalow",
        "Sheltered warden flat": "Flat",
        "Bungalow detached": "Bungalow",
        "Block": None,  # Does not match the specified property types
        "End Terraced House": "House",
        "Mid Terraced House": "House",
        "#N/A": None,  # Assuming this is an invalid or missing entry
        0: None  # Assuming 0 is also an invalid or missing entry
    },
    "HA30": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "House with Attached Garage": "House",
        "Bed Space": None,  # Assuming this does not fit the specified property types
        "House with Garage": "House",
        "Bungalow with Wheelchair Access": "Bungalow",
        "Maisonette": "Maisonette",
        "Flat with Wheelchair Access": "Flat",
        "Bedsit": None,  # Assuming this does not fit the specified property types
        "Flat w Wheelchair Access & Car Park": "Flat",
        "House with Wheelchair Access": "House",
        "Bungalow w Wheelchair Access & Car ": "Bungalow"
    },
    "HA32": {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        "Bungalow Disabled": "Bungalow",  # "Disabled" properties categorized with their base type
        "House": "House",
        "Dormer Bungalow": "Bungalow",
        "Pop-In": None,  # Does not fit the specified property types
        "Flat Disabled": "Flat",
        "Laundry": None,  # Does not fit the specified property types
        "Bedsit": None,  # Excluded from the given categories
        "Shed": None,  # Does not fit the specified property types
        "Store Room": None  # Does not fit the specified property types
    },
    "HA34": {
        "Flat": "Flat",
        "House": "House",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "ND": None,
    },
    "HA35": {
        "Flat": "Flat",
        "Maisonette": "Maisonette",
        "House": "House",
        "Bedsit": None,
        "2 Bedroom Unknown": None,
        "1 Bedroom Unknown": None,
        "3 Bedroom Unknown": None,
        "4 Bedroom Unknown": None,
    },
    "HA37": {
        "FLT": "Flat",
        "HSE": "House",
        "BNW": "Bungalow",
        "MAS": "Maisonette",
        "HSL": None
    },
    "HA39": {
        "Semi house": {"property_type": "House", "built_form": "Semi-Detached"},
        "1st floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"},
        "Ground floor flat": {"property_type": "Flat", "built_form": None},
        "End terrace house": {"property_type": "House", "built_form": "End-Terrace"},
        "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"},
        "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"},
        "2nd floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"},
        "3rd floor flat": {"property_type": "Flat", "built_form": None},
        "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"},
        "Maisonette": {"property_type": "Maisonette", "built_form": None},
        "Detached house": {"property_type": "House", "built_form": "Detached"},
        "Lower ground floor flat": {"property_type": "Flat", "built_form": None},
        "Dormer bungalow": {"property_type": "Bungalow", "built_form": None},
        "Basement flat": {"property_type": "Flat", "built_form": None},
        "Cluster House": {"property_type": "House", "built_form": "Detached"},
        "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None},
        "Ground floor flat with study": {"property_type": "Flat", "built_form": None},
        "4th floor flat": {"property_type": "Flat", "built_form": None},
        "1st floor flat with study room": {"property_type": "Flat", "built_form": None},
        "2nd floor flat with study": {"property_type": "Flat", "built_form": None},
    },
    "HA41": {
        "Garage": None,
        "House 1919-1945": "House",
        "House 1946-1964": "House",
        "Flats & Maisonettes post 1974": "Flat",
        "Non traditional houses": "House",
        "Sheltered": None,
        "Flats & Maisonettes 1965-1974": "Flat",
        "House post 1974": "House",
        "Block": None,
        "Flats & Maisonettes 1946-1964": "Flat",
        "House 1965-1974": "House",
        "Non traditional flats": "Flat",
        "Bungalow 1965-1974": "Bungalow",
        "PIMSS EMPTY": None,
        "Bungalow post 1974": "Bungalow",
        "Bungalow 1946-1964": "Bungalow",
        "Flats & Maisonettes 1919-1945": "Flat",
        "House pre 1919": "House",
        "Flats & Maisonettes pre 1919": "Flat",
        "Bungalow 1919-1945": "Bungalow",
        "Office": None
    },
    "HA42": {
        "Flat": "Flat",
        "House": "House",
        "Flat Basement": "Flat",
        "Room": None,
        "Bedsit Flat": "Flat",
        "Maisonette": "Maisonette",
        "Scheme Office": None,
        "Scheme Lounge": None,
        "Bungalow": "Bungalow",
        "Garage": None,
        "Scheme Sleep Room": None,
        "Cluster": None,
        "Scheme Room": None
    },
    "HA45": {
        "Large block of flats": "Flat",
        "Small block of flats/dwelling converted in to flats": "Flat",
        "Semi-detached house": "House",
        "Mid-terraced house": "House",
        "End-terraced house": "House",
        "Block of flats": "Flat",
        "Detached house": "House",
        "Flat in mixed use building": "Flat",
    },
    "HA48": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Unit": None
    },
    "HA50": {
        "House": "House",
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        "House SD": "House",
        "House MT": "House",
        "House ET": "House",
        "Bungalow ET": "Bungalow",
        "House SD ": "House",
        "House. SD": "House",
        "Bungalow SD": "Bungalow",
        "Bungalow MT": "Bungalow",
        "Bungalow D": "Bungalow",
        "House D": "House",
        "House. MT": "House",
        "House ": "House",
        "House ET ": "House",
        " ": None,
        "Flat?": "Flat",
        "Bungalow ": "Bungalow"
    },
    "HA51": {
        "FLAT": "Flat",
        "HOUSE": "House",
        "MAISONETTE": "Maisonette",
        "BEDSIT": None,  # Considering as a non-specific residential category here
        "BUNGALOW": "Bungalow",
    },
    "HA52": {
        "House - Mid Terrace": "House",
        "Flat - First Floor": "Flat",
        "Flat - Ground Floor": "Flat",
        "House - Semi-Detached": "House",
        "House - End Terrace": "House",
        "Flat - Second Floor": "Flat",
        "Bedsit": None,  # Considering as a non-specific residential category here
        "Bungalow - Semi-Detached": "Bungalow",
        "Bungalow - Mid Terrace": "Bungalow",
        "Bungalow - End Terrace": "Bungalow",
        "House - Detached": "House",
        "Flat - Third Floor": "Flat",
        "House attached to flats": "House",
        "Flat - Fourth Floor": "Flat",
        "Bungalow - Detached": "Bungalow"
    },
    "HA56": {
        "House Non Specific": "House",
        "HOUSE TERRACED": "House",
        "HOUSE - SEMI DETACHD": "House",
        "Bungalow": "Bungalow",
        "House - End Terraced": "House",
        "Block": None,
        "Block with Communal": None,
        "Bungalow - Terraced": "Bungalow",
        "Bungalow - Semi Dtch": "Bungalow",
        "Block House with rooms": None,
        "Bungalow - End Terr": "Bungalow",
        "House - Mid Terraced": "House",
        "Bungalow - Detached": "Bungalow",
        "House - Detached": "House",
        "HOUSE THREE STOREY": "House",
        "Maisonette": "Maisonette",
        "Communal Block": None,
        "Scheme": None
    },
    "HA63": {
        "Flat": "Flat",
        "House - Semi detached": "House",
        "House - Detached": "House",
        "House - End Terrace": "House",
        "House - Mid Terrace": "House",
        "Bungalow - Semi detached": "Bungalow",
        "Bungalow": "Bungalow",
        "Bedsit": None,  # Considering as a non-specific residential category here
        "Maisonette": "Maisonette",
        "Bungalow - End Terrace": "Bungalow",
        "Bungalow - Detached": "Bungalow",
        "Maisonette - Mid Terrace": "Maisonette",
        "Maisonette - End Terrace": "Maisonette",
        "Studio Flat": "Flat",
        "Maisonette - Detached": "Maisonette",
        "Bungalow - Mid Terrace": "Bungalow",
        "Bedsit - Mid Terrace": None,
        "Bedsit - End Terrace": None,
        "Amenity Block - Semi detached": None,  # Assuming non-residential
        "Maisonette - Semi Detached": "Maisonette",
        "Amenity Block - Detached": None,  # Assuming non-residential
        "Hostel": None,  # Typically not considered a standard residential property for this context
        "Bungalow - Attached": "Bungalow",
        "Unknown": None,  # Not enough information to categorize
        "Studio Flat - Mid Terrace": "Flat",
        "Chalet - Wheelchair": None  # Specialized type, not categorized here
    },
    "HA107": {
        "property_type": {
            "HOUSE": "House",
            "BUNGALOW": "Bungalow",
            "GRD FLOOR FLAT": "Flat",
            "FIRST FLOOR FLAT": "Flat",
            "SHELTERED BUNGALOW": "Bungalow",
            "MAISONETTE": "Maisonette",
            "SECOND FLOOR FLAT": "Flat",
            "SHELTERED FIRST FLR": "Flat",
            "SHELTERED GROUND FLR": "Flat",
            # NOTE(review): other associations map bedsits to None or "Flat" - confirm "House".
            "GRD FLOOR BED SIT": "House"
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "Detatched": "Detached",
        }
    },
    "HA117": {
        "Flat": "Flat",
        "House": "House",
        "Bungalow": "Bungalow",
        "Flat over garage/underpass": "Flat",
    },
    "HAXXX": {
        "mid terraced house": "House",
        "semi detached house": "House",
        "1st fl 4 in a block": "Flat",
        "G/F 4 in a block": "Flat",
        "end terraced house": "House",
        "1st floor flat": "Flat",
        "G/F floor flat": "Flat",
        "semi detached bungalow": "Bungalow",
        "2nd floor flat": "Flat",
        "mid terrace bungalow": "Bungalow",
        "detached bungalow": "Bungalow",
        "end terrace bungalow": "Bungalow",
        "Staff accommodation": None  # Marked as None due to its special nature
    }
}


class DataLoader:
    # Associations whose asset list already holds the full address in ONE column.
    # "address"/"postcode" name the source columns to normalise.
    COLUMN_CONFIG = {
        "HA1": {"address": "Address", "postcode": "Address - Postcode"},
        "HA5": {"address": "Address", "postcode": "matching_postcode"},
        "HA6": {
            "address": "propertyaddress",
            "postcode": "address"  # The 'address' column actually contains postcode
        },
        "HA12": {"address": "Full Address", "postcode": "Postcode"},
        "HA16": {"address": "Address", "postcode": "Postcode"},
        "HA24": {"address": "Address", "postcode": "Postcode"},
        "HA25": {"address": "T1_Address", "postcode": "matching_postcode"},
        "HA30": {"address": "A_Address", "postcode": "A_Postcode"},
        "HA31": {"address": "A_Address", "postcode": "matching_postcode"},
        "HA45": {"address": "Full postal address", "postcode": "Postcode"},
        "HA48": {"address": "Full Address", "postcode": "Postcode"},
        "HA49": {"address": "Property Address Full", "postcode": "Property Postcode"},
        "HA52": {"address": "Postal Address", "postcode": "POSTCODE"},
        "HA54": {"address": "Postal Address", "postcode": "matching_postcode"}
    }

    # Associations whose address must be assembled from several columns:
    # ha_name -> (address columns in concatenation order, postcode column).
    # The postcode is always appended as the final address component.
    _CONCAT_ADDRESS_COLUMNS = {
        "HA2": (("Address Line 1", "Address Line 2"), "Postcode"),
        "HA7": (("Address", "Address2", "Address3"), "Postcode"),
        "HA8": (("AddressLine1", "AddressLine2"), "Postcode"),
        "HA9": (
            ("House Number", "Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4"),
            "Postcode",
        ),
        "HA11": (("Address 1", "Address 2", "Address 3"), "Post Code"),
        "HA13": (("Address 1", "address 2", "Address 3"), "Postcode"),
        "HA14": (("Address 1", "Address 2", "Address 3", "Address 4"), "Postcode"),
        "HA15": (
            ("Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4"),
            "Postcode",
        ),
        "HA18": (("Address",), "Post Code"),
        "HA19": (("Address1", "Address2", "Address3"), "Postcode"),
        "HA20": (
            (
                "House Name", "Block", "Address Line 1", "Address Line 2",
                "Address Line 3", "Address Line 4",
            ),
            "Postcode",
        ),
        "HA21": (("Address",), "PostCode"),
        "HA27": ((" Address",), " Postcode"),
        "HA28": (("House Number", "Street 1"), "Postcode"),
        "HA32": (("Dwelling num", "Street"), "Postcode"),
        "HA33": (("ADDRESS",), "POST CODE"),
        "HA34": ((" Address",), " Postcode"),
        "HA35": (
            ("Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4"),
            "Address Post Code",
        ),
        "HA37": (("ADDRESS LINE 1", "ADDRESS LINE 2", "ADDRESS LINE 3"), "POSTCODE"),
        "HA38": (
            ("House_Number", "Address_Line_1", "Address_Line_2", "Address_Line_3"),
            "Postcode",
        ),
        "HA39": (("add_1", "add_2", "add_3", "add_4", "add_5"), "post_code"),
        "HA41": (
            ("AddressLine1", "AddressLine2", "AddressLine3", "AddressLine4", "AddressLine5"),
            "Postcode",
        ),
        "HA42": (("Dwelling Number", "Street", "Locality", "Town"), "Postcode"),
        "HA44": (("Address 1", "Address 2"), "Postal Code"),
        "HA50": (("Address Line 1",), "Post Code"),
        "HA51": (("Address Line 1", "Address Line 2", "Address Line 3"), "Postcode"),
        "HA56": (("Address 1", "Address 2", "Address 3"), "Post Code"),
        "HA63": (("Address1",), "POSTCODE"),
        "HA70": (("Address1",), "POSTCODE"),
        "HA107": (("House No", "Street", "Town", "District"), "Postcode"),
        "HA117": (("Address1", "Address2"), "PostCode"),
        "HAXX": (("Address",), "PostCode"),
        "HAXXX": (("Combined Address",), "Postcode"),
    }

    # Separator placed between the first and second address component when it is not
    # the usual ", " (HA42 joins dwelling number and street with a single space).
    _FIRST_SEPARATOR = {"HA42": " "}

    UNMATCHED_CIGA = {
        "HA2": 0,
        "HA6": 117,
        "HA9": 0,
        "HA12": 6,
        "HA13": 119,
        "HA14": 3,
        "HA15": 3,
        "HA16": 7,
        "HA24": 12,
        "HA50": 4,
        "HA63": 15,
        "HA107": 51,
        "HA48": 0,
        "HA45": 0,
        "HA52": 5,
        "HA20": 6
    }

    UNMATCHED_ECO3 = {
        "HA25": 154,
        "HA41": 26,
        "HA50": 5,
        "HA56": 320,
        "HA63": 0,
        "HA117": 4,
        "HA51": 24
    }

    def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
        """Store the loader configuration; no data is read at construction time.

        All four arguments are kept verbatim on the instance. ``data`` starts as an
        empty dict and ``december_figures`` / ``facts_and_figures`` start as None.
        """
        self.directories = directories
        self.use_cache = use_cache
        self.december_figures_filepath = december_figures_filepath
        self.rebuild = rebuild

        self.data = {}
        self.december_figures = None
        self.facts_and_figures = None

    @staticmethod
    def _normalise_text(series):
        """Return *series* cast to str, lower-cased and stripped of outer whitespace."""
        return series.astype(str).str.lower().str.strip()

    def create_asset_list_matching_address(self, ha_name, asset_list):
        """Add ``matching_address`` and ``matching_postcode`` columns to *asset_list*.

        Every source column is cast to ``str``, lower-cased and stripped before use.

        * Associations in ``COLUMN_CONFIG`` copy a single address column and a single
          postcode column. HA25 is the exception: its postcode is taken as the last
          two whitespace-separated tokens of the normalised address.
        * Associations in ``_CONCAT_ADDRESS_COLUMNS`` join their address columns and
          the postcode with ``", "`` (HA42 joins its first two components with a
          space). HA51 additionally prefixes the ``Block`` column when it is non-blank.

        :param ha_name: housing association identifier, e.g. ``"HA14"``.
        :param asset_list: DataFrame holding that association's raw address columns;
            it is modified in place.
        :return: the same *asset_list* with the two matching columns set.
        :raises NotImplementedError: if *ha_name* has no address configuration.
        """
        normalise = self._normalise_text

        if ha_name in self.COLUMN_CONFIG:
            columns = self.COLUMN_CONFIG[ha_name]
            asset_list["matching_address"] = normalise(asset_list[columns["address"]])
            if ha_name == "HA25":
                asset_list["matching_postcode"] = asset_list["matching_address"].apply(
                    lambda x: " ".join(x.split()[-2:]) if pd.notnull(x) else x
                )
            else:
                asset_list["matching_postcode"] = normalise(asset_list[columns["postcode"]])
            return asset_list

        if ha_name not in self._CONCAT_ADDRESS_COLUMNS:
            raise NotImplementedError("implement me")

        address_columns, postcode_column = self._CONCAT_ADDRESS_COLUMNS[ha_name]
        first_separator = self._FIRST_SEPARATOR.get(ha_name, ", ")

        # Fold the components left to right; the postcode is the last component.
        address = normalise(asset_list[address_columns[0]])
        remaining = (*address_columns[1:], postcode_column)
        for position, column in enumerate(remaining, start=1):
            separator = first_separator if position == 1 else ", "
            address = address + separator + normalise(asset_list[column])
        asset_list["matching_address"] = address

        if ha_name == "HA51":
            # Prefix the block name only on rows where it is not blank.
            asset_list["matching_address"] = np.where(
                asset_list["Block"].str.strip().str.len() > 0,
                normalise(asset_list["Block"]) + ", " + asset_list["matching_address"],
                asset_list["matching_address"]
            )

        asset_list["matching_postcode"] = normalise(asset_list[postcode_column])
        return asset_list

@staticmethod
def extract_property_info_ha107(properties):
    """
    Map free-text HA107 property descriptions to a property type and a built form.

    Each description is searched for the known keywords by substring match; the first keyword
    found wins and anything that matches no keyword is left as None.

    :param properties: iterable of description values (the caller passes the unique values of
        the "Property type" column). Non-string entries such as None/NaN map to (None, None).
    :return: DataFrame with one row per input and the columns "Property type" (the original
        description), "property_type" and "built_form". The columns are present even when
        ``properties`` is empty.
    """
    property_types = {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
    }
    # "Semi Detached" has to be tested before "Detached": the shorter keyword is a substring of
    # the longer one and would otherwise claim every semi-detached description.
    built_forms = {
        "Semi Detached": "Semi-Detached",
        "Detached": "Detached",
        "End Terrace": "End-Terrace",
        "Mid Terrace": "Mid-Terrace",
    }

    def first_match(description, lookup):
        # Return the mapped value of the first keyword contained in the description
        for keyword, mapped in lookup.items():
            if keyword in description:
                return mapped
        return None

    results = []
    for property_description in properties:
        is_text = isinstance(property_description, str)
        results.append(
            {
                "Property type": property_description,
                "property_type": first_match(property_description, property_types) if is_text else None,
                "built_form": first_match(property_description, built_forms) if is_text else None,
            }
        )

    # Naming the columns keeps the merge key available when there were no descriptions at all
    return pd.DataFrame(results, columns=["Property type", "property_type", "built_form"])


def append_asset_list_built_form(self, ha_name, asset_list):
    """
    Add property type / built form columns for the housing associations that need them.

    HA6 gets a "built_form" column by applying ``self.identify_built_form_ha6`` to
    "Property Type". HA107 gets "property_type" and "built_form" left-merged on via
    ``extract_property_info_ha107``, keyed on "Property type". Any other ``ha_name`` is
    returned unchanged.

    :param ha_name: housing association identifier, e.g. "HA6"
    :param asset_list: asset list DataFrame
    :return: the asset list, with the extra columns where applicable
    """
    if ha_name == "HA6":
        asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6)

    if ha_name == "HA107":
        mapped_df = self.extract_property_info_ha107(asset_list["Property type"].unique())
        asset_list = asset_list.merge(mapped_df, how="left", on="Property type")

    return asset_list


@staticmethod
def create_asset_list_house_no(ha_name, asset_list):
    """
    Append a "HouseNo" column to the asset list.

    For housing associations whose sheet already has a house number column, that column is
    copied. For everyone else the house number is taken as the first space-separated token of
    the first comma-separated part of "matching_address".

    :param ha_name: housing association identifier
    :param asset_list: asset list DataFrame; needs "matching_address" unless ``ha_name`` has a
        dedicated house number column
    :return: the asset list with "HouseNo" added
    """
    # Housing associations that provide the house number in a column of its own
    house_no_columns = {
        "HA107": "House No",
        "HA32": "Dwelling num",
        "HA28": "House Number",
        "HA38": "House_Number",
        "HA9": "House Number",
        "HAXXX": "Door Number",
    }
    if ha_name in house_no_columns:
        asset_list["HouseNo"] = asset_list[house_no_columns[ha_name]].copy()
        return asset_list

    split_addresses = asset_list['matching_address'].str.split(',', expand=True)
    house_numbers = split_addresses[0].str.split(' ', expand=True)

    # If we have "flat" or "valley" as the house number, then the house number is actually in the
    # second column. That column only exists when at least one address contains a space, so
    # guard against looking it up when it is absent.
    if house_numbers.shape[1] > 1:
        house_numbers[0] = np.where(
            house_numbers[0].isin(["flat", "valley"]), house_numbers[1], house_numbers[0]
        )

    # The first column is the house number - we aren't interested in the other columns, and we
    # don't know how many of them there might be
    house_numbers = house_numbers.iloc[:, 0:1].copy()
    house_numbers.columns = ['HouseNo']
    # Remove trailing punctuation such as , or ;
    house_numbers["HouseNo"] = house_numbers["HouseNo"].str.rstrip(',;')

    return pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)


@staticmethod
def create_ciga_list_house_no(ciga_list):
    """
    Append a "HouseNo" column to the CIGA list.

    The house number is the first space-separated token of the first comma-separated part of
    "Matched Address".

    :param ciga_list: CIGA list DataFrame with a "Matched Address" column
    :return: the CIGA list with "HouseNo" added
    """
    split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
    house_numbers = split_addresses[0].str.split(' ', expand=True)

    # Only the first token is of interest; the number of further columns is unknown
    house_numbers = house_numbers.iloc[:, 0:1]
    house_numbers.columns = ['HouseNo']

    return pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1)


@staticmethod
def dedupe_ciga_list(ciga_list):
    """
    Drop CIGA rows that refer to the same address, keeping the first occurrence.

    Rows are compared on a "unique_key" built from "Matched Address" + "Matched Postcode" with
    spaces and punctuation removed. The key column is added to the frame that is passed in and
    is kept on the result.

    :param ciga_list: CIGA list DataFrame
    :return: the CIGA list without duplicated keys
    """
    ciga_list["unique_key"] = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
    # Remove spaces from the unique key
    ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(" ", "", regex=False)
    # Remove punctuation from the unique key. regex=True is required: without it the pattern is
    # taken literally on pandas >= 2 and nothing is removed.
    ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(r'[^\w\s]', '', regex=True)
    # Drop duplicated keys
    return ciga_list[~ciga_list["unique_key"].duplicated()]


@staticmethod
def get_asset_sheetname(workbook):
    """
    Return the name of the asset sheet, trying the known spellings in priority order.

    :param workbook: object exposing ``sheetnames``
    :return: the first known name present in the workbook, or "Assets" when none is
    """
    sheetnames = workbook.sheetnames
    for candidate in ("Asset List", "Asset list"):
        if candidate in sheetnames:
            return candidate
    # "Asset" is only used when there is no "Assets" sheet alongside it
    if "Asset" in sheetnames and "Assets" not in sheetnames:
        return "Asset"
    for candidate in ("Decent Homes Stock", "Report"):
        if candidate in sheetnames:
            return candidate
    return "Assets"


@staticmethod
def get_ciga_sheetname(workbook):
    """
    Return the name of the CIGA sheet, trying the known spellings in priority order.

    :param workbook: object exposing ``sheetnames``
    :return: the first known name present in the workbook, or "CIGA" when none is
    """
    sheetnames = workbook.sheetnames
    for candidate in ("CIGA Checks", "CIGA checks", "CIGA check", "CIGA Check", "CIGA requested"):
        if candidate in sheetnames:
            return candidate
    return "CIGA"


@staticmethod
def get_survey_sheetname(workbook):
    """
    Return the name of the ECO survey sheet, trying the known spellings in priority order.

    :param workbook: object exposing ``sheetnames``
    :return: the first known name present in the workbook, or "ECO surveys" when none is
    """
    sheetnames = workbook.sheetnames
    for candidate in ("ECO Surveys", "ECO Survey", "ECO 4 Surveys completed", "ECO4 Surveys"):
        if candidate in sheetnames:
            return candidate
    return "ECO surveys"


@staticmethod
def correct_ha51_asset_list(asset_list):
    """
    Apply the HA51-specific correction to the asset list.

    For rows whose "matching_address" contains "61 wandle bank", "HouseNo" is replaced with the
    lower-cased "Block" value.

    :param asset_list: asset list DataFrame
    :return: the corrected asset list
    """
    asset_list["HouseNo"] = np.where(
        asset_list["matching_address"].str.contains("61 wandle bank"),
        asset_list["Block"].str.lower(),
        asset_list["HouseNo"]
    )
    return asset_list


def prepare_ha17(self, workbook):
    """
    Build the HA17 asset list from its two cavity wall sheets.

    The blocks sheet and the street properties sheet are each given "matching_address",
    "matching_postcode" and "property_type" columns and stacked. Rows whose address holds a
    house number range ("<start> to <end> <rest>") are expanded into one row per house number,
    unless the row is marked "Not Eligible". An "asset_list_row_id" and a "HouseNo" column are
    then added.

    :param workbook: workbook containing the sheets "Blocks List - Cavity Wall only" and
        "Street Properties - Cavity Wall"
    :return: the expanded asset list DataFrame
    """
    block_name_column = "Block Name\n[as per Naming Convention procedure]"
    kept_columns = ["matching_address", "matching_postcode", "property_type", "ECO Eligibility"]

    blocks_sheet = workbook["Blocks List - Cavity Wall only"]
    blocks_colnames = [cell.value for cell in blocks_sheet[2]]
    # Headers are on row 2. Data is read from row 4 here, but from row 3 on the street sheet.
    # NOTE(review): presumably row 3 of this sheet is not a data row - confirm against the file.
    blocks_df = pd.DataFrame(
        list(blocks_sheet.iter_rows(min_row=4, values_only=True)), columns=blocks_colnames
    )
    blocks_df["matching_address"] = (
        blocks_df[block_name_column].astype(str).str.lower().str.strip() + ", " +
        blocks_df["Block Street Name"].astype(str).str.lower().str.strip() + ", " +
        blocks_df["Postcode"].astype(str).str.lower().str.strip()
    )
    blocks_df["matching_postcode"] = blocks_df["Postcode"].astype(str).str.lower().str.strip()
    blocks_df["property_type"] = "Flat"

    street_properties_sheet = workbook["Street Properties - Cavity Wall"]
    street_properties_colnames = [cell.value for cell in street_properties_sheet[2]]
    street_properties_df = pd.DataFrame(
        list(street_properties_sheet.iter_rows(min_row=3, values_only=True)),
        columns=street_properties_colnames
    )
    street_properties_df["matching_address"] = (
        street_properties_df[block_name_column].astype(str).str.lower().str.strip() + ", " +
        street_properties_df["Postcode"].astype(str).str.lower().str.strip()
    )
    street_properties_df["matching_postcode"] = (
        street_properties_df["Postcode"].astype(str).str.lower().str.strip()
    )
    street_properties_df["property_type"] = street_properties_df[
        "Block typology based on dwelling type\n[defined list]"
    ]

    asset_list_compressed = pd.concat(
        [blocks_df[kept_columns], street_properties_df[kept_columns]],
        axis=0
    )

    # Expand house number ranges into one row per house number
    range_pattern = re.compile(r"(\d+)\s+to\s+(\d+)\s+(.*)")
    asset_list = []
    for _, row in tqdm(asset_list_compressed.iterrows(), total=len(asset_list_compressed)):
        match = None
        if row["ECO Eligibility"] != "Not Eligible":
            # Detect a house number range
            match = range_pattern.search(row["matching_address"])

        if not match:
            # Not eligible, or no range in the address: keep the row as it is
            asset_list.append(row.to_dict())
            continue

        start_number = int(match.group(1))
        end_number = int(match.group(2))
        rest_of_address = match.group(3)

        # The range is inclusive of both ends. Only the text from the range onwards is carried
        # into the expanded address.
        for house_number in range(start_number, end_number + 1):
            entry = row.to_dict()
            entry["matching_address"] = f"{house_number} {rest_of_address}"
            asset_list.append(entry)

    asset_list = pd.DataFrame(asset_list)

    # Add in asset_list_row_id
    asset_list["asset_list_row_id"] = ["HA17" + str(i) for i in range(0, len(asset_list))]

    # Add on house number
    asset_list = self.create_asset_list_house_no(ha_name="HA17", asset_list=asset_list)

    return asset_list


def load_asset_list(self, filepath, ha_name):
    """
    Load a housing association workbook and return its asset, survey, CIGA and ECO3 lists.

    The asset sheet is read, cleaned and given matching address, house number and built form
    columns, then passed through ``correct_<ha_name>_asset_list`` when such a method exists.
    Where present, the ECO3, survey and CIGA sheets are read and merged against the asset list
    with ``merge_eco3_to_assets``, ``merge_surveys_to_assets`` and ``merge_ciga_to_assets``.

    :param filepath: path of the workbook, passed to ``openpyxl.load_workbook``
    :param ha_name: housing association identifier, e.g. "HA6"
    :return: tuple ``(asset_list, survey_list, ciga_list, eco3_list)``. Lists that are not read
        for the given ``ha_name`` are returned as empty DataFrames.
    """
    workbook = openpyxl.load_workbook(filepath)

    if ha_name == "HA17":
        # HA17 has its own workbook layout and no survey / CIGA / ECO3 lists are read for it
        asset_list = self.prepare_ha17(workbook)
        return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    asset_sheetname = self.get_asset_sheetname(workbook)
    asset_sheet = workbook[asset_sheetname]
    asset_sheet_colnames = [cell.value for cell in asset_sheet[1]]

    # For these housing associations the header at the given position is overridden so that the
    # column is read in as "matching_postcode"
    postcode_column_positions = {"HA25": 11, "HA31": 2, "HA54": 10, "HA5": 2}
    if ha_name in postcode_column_positions:
        asset_sheet_colnames[postcode_column_positions[ha_name]] = "matching_postcode"

    asset_list = pd.DataFrame(
        list(asset_sheet.iter_rows(min_row=2, values_only=True)), columns=asset_sheet_colnames
    )
    # Remove columns without a header
    asset_list = asset_list.loc[:, asset_list.columns.notnull()]
    # Remove entirely empty rows - consider all columns apart from row_color
    asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)]

    # Add in asset_list_row_id
    asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]

    # Create matching address and matching postcode, house number and built form
    asset_list = self.create_asset_list_matching_address(ha_name=ha_name, asset_list=asset_list)
    asset_list = self.create_asset_list_house_no(ha_name=ha_name, asset_list=asset_list)
    asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list)

    # Correct the asset list if a correction method exists for this housing association
    asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list", None)
    if asset_list_correction_function is not None:
        asset_list = asset_list_correction_function(asset_list)

    # HA1 and HA27 are an exception in the structure of the data: there are no survey or CIGA
    # lists, so the asset list is returned now
    if ha_name in ["HA1", "HA27"]:
        return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3
    # won't be suitable under ECO4, since their walls will be filled
    eco3_list = pd.DataFrame()
    eco3_sheetnames = [name for name in workbook.sheetnames if "eco3" in name.lower().replace(" ", "")]
    if eco3_sheetnames:
        # Only the first sheet whose name contains "eco3" is used
        eco3_sheet = workbook[eco3_sheetnames[0]]
        eco3_list = pd.DataFrame(
            list(eco3_sheet.iter_rows(min_row=2, values_only=True)),
            columns=[cell.value for cell in eco3_sheet[1]]
        )
        # Remove columns that are None, then rows that are completely empty
        eco3_list = eco3_list.loc[:, eco3_list.columns.notnull()]
        eco3_list = eco3_list.loc[eco3_list.notnull().any(axis=1)]
        eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))]

    # Perform the eco3 merge
    if not eco3_list.empty:
        eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name)

    if ha_name in ["HA25"]:
        # Accommodate the HA25 structure: no survey or CIGA lists are read
        return asset_list, pd.DataFrame(), pd.DataFrame(), eco3_list

    # Read the survey list. The sheet is looked up directly, without an existence check.
    survey_sheet = workbook[self.get_survey_sheetname(workbook)]
    survey_list = pd.DataFrame(
        list(survey_sheet.iter_rows(min_row=2, values_only=True)),
        columns=[cell.value for cell in survey_sheet[1]]
    )
    # Remove columns that are None, then rows that are completely empty
    survey_list = survey_list.loc[:, survey_list.columns.notnull()]
    survey_list = survey_list.loc[survey_list.notnull().any(axis=1)]
    survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))]

    # Perform survey list merge
    if not survey_list.empty:
        survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)

    # Read the CIGA checks. The sheet is looked up directly, without an existence check.
    ciga_sheet = workbook[self.get_ciga_sheetname(workbook)]
    ciga_list = pd.DataFrame(
        list(ciga_sheet.iter_rows(min_row=2, values_only=True)),
        columns=[cell.value for cell in ciga_sheet[1]]
    )
    # Remove columns that are None, then rows that are completely None
    ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
    ciga_list = ciga_list.loc[ciga_list.notnull().any(axis=1)]

    # Perform ciga list merge
    if not ciga_list.empty:
        # Remove rows with missing postcode which happens in a small number of cases
        ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
        ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
        ciga_list = self.create_ciga_list_house_no(ciga_list)
        ciga_list = self.dedupe_ciga_list(ciga_list)
        ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)

    return asset_list, survey_list, ciga_list, eco3_list
@staticmethod def correct_ha6_asset_list(asset_list): asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") asset_list["matching_address"] = asset_list["matching_address"].str.replace("baggott place", "baggotts place") asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree") asset_list["matching_address"] = asset_list["matching_address"].str.replace("cherry tree", "cherrytree") asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close") asset_list["matching_address"] = asset_list["matching_address"].str.replace("maryhill close", "mary hill close") asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way") asset_list["matching_address"] = asset_list["matching_address"].str.replace("moffat way", "moffatt way") return asset_list @staticmethod def correct_ha56_asset_list(asset_list): # CH1 4JR has already been surveyed, but it's listed in the asset list # as a single row, when it's actually 32 units, so we just set this # as ineligible asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "CH1 4JR", "Not eligible", asset_list["ECO Eligibility"] ) # Same for CW8 3EU asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "CW8 3EU", "Not eligible", asset_list["ECO Eligibility"] ) asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "CW1 3HP", "Not eligible", asset_list["ECO Eligibility"] ) asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "WA4 2PH", "Not eligible", asset_list["ECO Eligibility"] ) asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "BD6 1QJ", "Not eligible", asset_list["ECO Eligibility"] ) asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "L39 1RS", "Not eligible", asset_list["ECO Eligibility"] ) asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == 
"WA10 2DE", "Not eligible", asset_list["ECO Eligibility"] ) # Already surveyed under ECO4 asset_list["ECO Eligibility"] = np.where( asset_list["Post Code"] == "SK17 6NR", "Not eligible", asset_list["ECO Eligibility"] ) asset_list["ECO Eligibility"] = np.where( ((asset_list["Post Code"] == "WA5 0EN") & (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")), "Not eligible", asset_list["ECO Eligibility"] ) return asset_list @staticmethod def correct_ha14_asset_list(asset_list): # For 5 Queens Court, DE72 3NP, the postcode is actually DE72 3QZ asset_list.loc[ (asset_list["Address 1"] == "5 Queens Court") & (asset_list["Postcode"].str.strip() == "DE72 3NP"), "matching_postcode" ] = "DE72 3QZ" # We then correct the matching_address asset_list.loc[ (asset_list["Address 1"] == "5 Queens Court") & (asset_list["Postcode"].str.strip() == "DE72 3NP"), "matching_address" ] = "5 queens court, garfield avenue, draycott, derby, de72 3qz" return asset_list @staticmethod def correct_ha15_asset_list(asset_list): asset_list["matching_postcode"] = np.where( asset_list["Address Line 1"] == "103 Priory Crescent", "hp19 9ny", asset_list["matching_postcode"] ) return asset_list @staticmethod def correct_ha32_asset_list(asset_list): asset_list["Postcode"] = np.where( (asset_list["Street"] == "Norton Grove") & (asset_list["Postcode"] == "HU4 6HQ") & ( asset_list["Dwelling num"] == "7"), "hu4 6hg", asset_list["Postcode"] ) return asset_list @staticmethod def correct_ha38_asset_list(asset_list): # For Kingsford court, the house number is at the end of the address def rearrange_address_if_flat(address): if '/flat' in address.lower(): parts = address.split('/flat', 1) return f"FLAT{parts[1]}, {parts[0]}" return address def extract_house_no_if_flat(address): if '/flat' in address.lower(): # Attempt to extract the house number following "/flat" try: house_no = address.split('/flat ')[1].split(' ')[0] # Remove trailing comma house_no = house_no.replace(",", "") except IndexError: house_no = None 
return house_no return None asset_list['ExtractedHouseNo'] = asset_list['matching_address'].apply(extract_house_no_if_flat) asset_list.loc[asset_list['ExtractedHouseNo'].notnull(), 'HouseNo'] = asset_list['ExtractedHouseNo'] asset_list['matching_address'] = asset_list['matching_address'].apply(rearrange_address_if_flat) # We update a few specific rows asset_list["HouseNo"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/ROOM A1", "10 SOUTH VIEW/ROOM A2", "10 SOUTH VIEW/ROOM A3", ] )), "10A", asset_list["HouseNo"] ) asset_list["matching_address"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/ROOM A1", ] )), "10a, 10 south view/room a1, spennymoor, co. durham, dl16 7df'", asset_list["matching_address"] ) asset_list["HouseNo"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/ROOM B1", "10 SOUTH VIEW/ROOM B2", "10 SOUTH VIEW/ROOM B3", "10 SOUTH VIEW/ROOM B4", ] )), "10B", asset_list["HouseNo"] ) asset_list["matching_address"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/ROOM B1", ] )), "10b, 10 south view/room b1, spennymoor, co. durham, dl16 7df", asset_list["matching_address"] ) asset_list["HouseNo"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/FLAT C", ] )), "10C", asset_list["HouseNo"] ) asset_list["matching_address"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/FLAT C", ] )), "FLAT c, spennymoor, co. durham, dl16 7df, 10c, 10 south view", asset_list["matching_address"] ) asset_list["HouseNo"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/FLAT D", ] )), "10D", asset_list["HouseNo"] ) asset_list["matching_address"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/FLAT D", ] )), "FLAT d, spennymoor, co. 
durham, dl16 7df, 10d, 10 south view", asset_list["matching_address"] ) asset_list["HouseNo"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/FLAT E", ] )), "10E", asset_list["HouseNo"] ) asset_list["matching_address"] = np.where( (asset_list["Address_Line_1"].isin( [ "10 SOUTH VIEW/FLAT E", ] )), 'FLAT e, spennymoor, co. durham, dl16 7df, 10e, 10 south view', asset_list["matching_address"] ) return asset_list @staticmethod def correct_ha6_survey_list(survey_list): # Correct the survey list survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Seabridge Road", "Seabridge Lane" ) # Strip out /KNUTTON from the street name survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/KNUTTON", "") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Clevend Road", "Cleveland Road" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "TURNERS AVENUE", "Turner Avenue" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "WEDGEWWOD AVENUE", "Wedgwood Avenue" ) # The cherrytree record has wrong postcode survey_list.loc[survey_list["Street / Block Name"] == "Cherrytree road", "Post Code"] = "ST5 7BP" survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "MONUMENT RD", "Monument Road" ) # Generally replace " RD" with " Road" survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(" RD", " Road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "HILARY Road", "Hillary Road" ) # Remove full stops from the street name survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(".", "") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Chatworth road", "Chatsworth Place" ) survey_list["Street / Block Name"] = survey_list["Street / Block 
Name"].str.replace( "Wood Croft", "Woodcroft" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Milstone Avenue", "Millstone Avenue" ) # Strip out /TALKE from the street name survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/TALKE", "") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Woodcutts Street", "Woodshutts Street" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "HILLARY AVENUE", "Hillary Road" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "HILLARY AVENUE", "Hillary Road" ) # Replace " Rd" with " Road" survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(" Rd", " Road") # We have a record listed as 19, MAPLE AVENUE ST7 1JX, when it should be 19, Hollins Crescent ST7 1JX survey_list.loc[ (survey_list["Street / Block Name"] == "MAPLE AVENUE") & (survey_list["NO."].isin([19])) & (survey_list["Post Code"] == "ST7 1JX"), "Street / Block Name" ] = "Hollins Crescent" # However, some of the maple avenue records, are indeed Maple avenue, but are listed with the wrong postcode. # E.g. 
number 26 survey_list.loc[ (survey_list["Street / Block Name"] == "MAPLE AVENUE") & (survey_list["NO."].isin([26])) & (survey_list["Post Code"] == "ST7 1JX"), "Post Code" ] = "ST7 1JW" survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "BURSLEY Road", "Bursley Way" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Brittania Avenue", "Brittain Avenue" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Hawthorn Road", "Hawthorne Road" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Eastdale Place", "Easdale Place" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Wedgewood Road", "Wedgwood Road" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Droitwich Drive", "Droitwich Close" ) survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Longdale Road", "Langdale Road" ) # We have 2 addresses in the survey list that don't have postcodes. 
We'll manually add them in survey_list.loc[ (survey_list["Street / Block Name"] == "Rogers Avenue") & pd.isnull(survey_list["Post Code"]), "Post Code" ] = "ST5 9AT" survey_list.loc[ (survey_list["Street / Block Name"] == "Cedar Road") & pd.isnull(survey_list["Post Code"]), "Post Code" ] = "ST5 7BY" # PERFORM ADDITIONAL DROPS # Dropping rows based on multiple conditions conditions_to_drop = [ (survey_list['Street / Block Name'] == "Bedford Crescent") & (survey_list['Post Code'] == "ST5 3EH") & ( survey_list['NO.'] == 23) & (survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), (survey_list['Street / Block Name'] == "Hereford Avenue") & (survey_list['Post Code'] == "ST5 3EJ") & ( survey_list['NO.'] == 92) & (survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), (survey_list['Street / Block Name'] == "Seabridge Lane") & (survey_list['Post Code'] == "ST5 3EX") & ( survey_list['NO.'].isin([16, 18, 42])) & ( survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), (survey_list['Street / Block Name'] == "ESKDALE PLACE") & (survey_list['Post Code'] == "ST5 3QW") & ( survey_list['NO.'] == 5) & (survey_list['SUBMISSION DATE'].astype(str) == "2023-03-06 00:00:00"), (survey_list['Street / Block Name'] == "Birch House road") & (survey_list['Post Code'] == "ST6 2LS") & ( survey_list['NO.'].isin([56, 58])), (survey_list['Street / Block Name'] == "Blackthorn Place") & (survey_list['Post Code'] == "ST6 2LS") & ( survey_list['NO.'].isin([37, 39])), (survey_list['Street / Block Name'] == "Whitethorn Way") & (survey_list['Post Code'] == "ST5 7BT") & ( survey_list['NO.'].isin([17, 6])), (survey_list['Street / Block Name'] == "Lion Grove") & (survey_list['Post Code'] == "ST5 7HQ") & ( survey_list['NO.'].isin([10, 12])) & ( survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), (survey_list['Street / Block Name'] == "DENRY CRESCENT") & (survey_list['Post Code'] == "ST5 8JW") & ( survey_list['NO.'] == 87) & 
(survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), (survey_list['Street / Block Name'] == "HOLLINS CRESCENT") & (survey_list['Post Code'] == "ST7 1JW") & ( survey_list['NO.'] == 19) ] # Combine all conditions with an OR "|" combined_condition = np.logical_or.reduce(conditions_to_drop) # Drop rows that meet the combined condition survey_list = survey_list[~combined_condition] # Making replacements using np.where survey_list['Post Code'] = np.where( (survey_list['Street / Block Name'] == "Whitethorn Way") & (survey_list['Post Code'] == "ST5 3EH") & ( survey_list['NO.'] == 17), "ST5 7BT", survey_list['Post Code'] ) survey_list['Post Code'] = np.where( (survey_list['Street / Block Name'] == "Whitethorn Way") & (survey_list['Post Code'] == "ST5 3ED") & ( survey_list['NO.'] == 6), "ST5 7BT", survey_list['Post Code'] ) # Maple avenue (stoke on trent, not newcastle) should be st7 1jw survey_list["Post Code"] = np.where( (survey_list["Street / Block Name"].str.lower().str.contains("maple avenue")) & ( survey_list["Post Code"].str.lower() == "st7 1jx" ), "st7 1jw", survey_list["Post Code"] ) # Hollins Crescent should be st7 1jx survey_list["Post Code"] = np.where( (survey_list["Street / Block Name"].str.lower().str.contains("hollins crescent")) & ( survey_list["Post Code"].str.lower() == "st7 1jw" ), "st7 1jx", survey_list["Post Code"] ) # Additional drops as the above misses some: survey_list = survey_list[ ~((survey_list["NO."].astype(str).isin(["18", "42"])) & (survey_list["Street / Block Name"] == "Seabridge Lane") & (survey_list["Post Code"] == "ST5 3EY") & (survey_list["SUBMISSION DATE"].astype(str) == "24.07.2023") & (survey_list["INSTALLED OR CANCELLED"].str.contains("NO UPDATE YET"))) ] return survey_list @staticmethod def correct_ha14_survey_list(survey_list): survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Godfrey Road", "Godfrey Drive" ) survey_list["Street / Block Name"] = survey_list["Street / Block 
Name"].str.replace( "Oiliver Road", "Oliver Road" ) # For postodes DE7 4FB, DE7 4EZ, it's actually spelled WINDERMERE AVENUE, not WINDEREMERE AVENUE (without the # extra e) survey_list.loc[ (survey_list["Street / Block Name"] == "WINDEREMERE AVENUE") & (survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"])), "Street / Block Name" ] = "WINDERMERE AVENUE" survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "MACDONALD SQAURE", "MACDONALD SQUARE" ) return survey_list @staticmethod def correct_ha15_survey_list(survey_list): survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive" ) return survey_list @staticmethod def correct_ha16_survey_list(survey_list): survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower() survey_list["Street / Block Name"] = np.where( survey_list["Street / Block Name"] == "REEDS RD", "Reeds ROAD", survey_list["Street / Block Name"] ) # Replace " rd " with "road" survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(r'\brd\b', 'road', regex=True) # Replace " , " with ", " survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace( " , ", ', ', ) # Fix "{place} ,{place}" with "{place}, {place}" survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(r'\s*,\s*', ', ', regex=True) # Strip whitespace survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.strip() # Correct errors survey_list["Post Code"] = np.where( survey_list["Post Code"] == "M38 0SA", "M38 9SA", survey_list["Post Code"] ) survey_list["Post Code"] = np.where( (survey_list["Street / Block Name"] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"), "M44 5JF", survey_list["Post Code"] ) survey_list["Street / Block Name"] = 
survey_list["Street / Block Name"].str.replace("eccels", "eccles") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("chatley, road", "chatley road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("vaughen", "Vaughan") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cresent", "crescent") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("plantation road", "plantation avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("how clough drive", "howclough drive") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brockhurst lane", "brookhurst lane") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("biirch road", "birch road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hadson road", "hodson road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("harbonne avennue", "narbonne avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "cumberland road, cadishead", "cumberland avenue, cadishead") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("aston field drive", "ashton field drive") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wedgewood road", "wedgwood road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hamilton close", "hamilton avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "lichens crescent, fitton hill", "lichens crescent") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("south croft, fitton hill", "south croft") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(", fitton hill", "") 
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("firtree dr", "fir tree avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hawthorne road", "hawthorn crescent") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("rein lee avenue", "reins lee avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("westerhill road", "wester hill road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("st martins road", "saint martins road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("timperley avenue", "timperley close") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("eastwood road", "eastwood avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("new road", "new street") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("grassmere road", "grasmere road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hulton road", "hulton avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("beechfield avenue", "beechfield road") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("princess avenue", "princes avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("edge ford crecent", "edge fold crescent") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("conniston avenue", "coniston avenue") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("blackthorne crescent", "blackthorn crescent") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wellstock road", "wellstock lane") survey_list["Street / Block Name"] = 
survey_list["Street / Block Name"].str.replace("brackley avenue", "brackley street") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brook avenue swinton", "brook avenue, swinton") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("green avenue swinton", "green avenue, swinton") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("grasmere avenue wardley", "grasmere avenue, wardley") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("mardale avenue wardle", "mardale avenue, wardle") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("carleach grove", "cartleach Grove") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("arbour grove", "arbor Grove") # Replacement for clively avenue 66-68 survey_list["NO."] = np.where( survey_list["NO."] == "66-68", "66", survey_list["NO."] ) # Delete some duplicated entries survey_list = survey_list[ ~((survey_list["Street / Block Name"] == "york road") & (survey_list["NO."].astype(str) == "12") & (survey_list["Post Code"] == "M44 5HU") & (survey_list["SUBMISSION DATE"].astype(str) == "45229")) ] survey_list = survey_list[ ~((survey_list["Street / Block Name"] == "peatfield avenue") & (survey_list["NO."].astype(str) == "23") & (survey_list["Post Code"] == "M27 9XG") & (survey_list["SUBMISSION DATE"].astype(str) == "45236")) ] return survey_list @staticmethod def correct_ha24_survey_list(survey_list): survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ") survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower() survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.strip() survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "council house, nidds lane", "nidds lane" ) survey_list["Street / Block Name"] = 
@staticmethod
def correct_ha24_survey_list(survey_list):
    """Clean the street names of the HA24 survey list.

    "/" is turned into ", ", the names are lower-cased and stripped, and then a fixed,
    ordered list of corrections is applied. Every correction is a literal substring
    replacement (``regex=False``), so the "." in "st. leodegars close" can never act as a
    regular-expression wildcard, whichever default the installed pandas version uses.

    :param survey_list: survey DataFrame with a "Street / Block Name" column; the column is
        overwritten on the frame that was passed in
    :return: the same DataFrame with corrected street names
    """
    street_col = "Street / Block Name"

    # Normalise separators, case and surrounding whitespace before matching on exact text
    streets = survey_list[street_col].str.replace("/", ", ", regex=False)
    streets = streets.str.lower().str.strip()

    # Order matters: the specific "council house, ..." fixes run before the generic
    # "council house, " prefix removal.
    corrections = (
        ("council house, nidds lane", "nidds lane"),
        ("wirral avenue", "wirrall avenue"),
        ("st ives road", "st. ives crescent"),
        ("sundringham road", "sandringham road"),
        ("milton avenue", "milton road"),
        ("st ives crescent", "st. ives crescent"),
        ("council house, waterbelly lane", "waterbelly lane"),
        # Generally remove "council house, " from the street name
        ("council house, ", ""),
        ("st. leodegars close", "st leodegars close"),
        ("montgomery crescent", "montgomery road"),
    )
    for wrong, right in corrections:
        streets = streets.str.replace(wrong, right, regex=False)

    survey_list[street_col] = streets
    return survey_list


@staticmethod
def correct_ha28_survey_list(survey_list):
    """Clean the HA28 survey list: column name, two postcodes and two block names.

    :param survey_list: survey DataFrame with "NO ", "Street / Block Name" and "Post Code"
        columns
    :return: a renamed copy of the frame ("NO " becomes "NO.") with the corrections applied
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"

    # The sheet labels the house-number column "NO " (trailing space); rename it to "NO."
    # to align with the other survey sheets
    survey_list = survey_list.rename(columns={"NO ": "NO."})

    # Postcodes that were typed without the separating space (exact matches only)
    for squashed, spaced in (("ME75HA", "ME7 5HA"), ("ME75TW", "ME7 5TW")):
        postcodes = survey_list[postcode_col]
        survey_list[postcode_col] = np.where(postcodes == squashed, spaced, postcodes)

    # Entries recorded as "<block>/<street>" are reduced to the block name
    for combined, block_name in (
        ("ANDREW MANOR/BRITTON ST", "ANDREW MANOR"),
        ("ST MARKS HOUSE/SAXON ST", "ST MARKS HOUSE"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(
            combined, block_name, regex=False
        )

    return survey_list
@staticmethod
def correct_ha38_survey_list(survey_list):
    """Clean the HA38 survey list.

    Renames the "NO " column to "NO.", corrects street/block names, drops survey rows the
    original comments describe as missing from the asset list, and fixes one postcode.
    All string replacements are literal (``regex=False``). Filtered frames are copied before
    further column assignments so that the writes never target a slice of another frame.

    :param survey_list: survey DataFrame with "NO ", "Street / Block Name" and "Post Code"
        columns
    :return: a new, corrected DataFrame
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"
    number_col = "NO."

    def replace_in_street(frame, wrong, right):
        # Literal substring replacement on the street column
        frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

    def keep_first_part(frame, separator):
        # Keep only the text before the first separator, without surrounding whitespace
        frame[street_col] = frame[street_col].str.split(separator).str[0].str.strip()

    def drop_rows(frame, unwanted):
        return frame[~unwanted].copy()

    # The sheet labels the house-number column "NO " (trailing space); rename it to "NO."
    # to align with the other survey sheets
    survey_list = survey_list.rename(columns={"NO ": "NO."})

    replace_in_street(survey_list, "Kingsford Court, Coombe Valley Road", "Kingsford Court")
    replace_in_street(survey_list, "LESLIE TEW COURT/DERWENT ROAD", "LESLIE TEW COURT")

    # There is no 18A LESLIE TEW COURT in the asset list
    survey_list = drop_rows(
        survey_list,
        (survey_list[street_col] == "LESLIE TEW COURT")
        & (survey_list[postcode_col] == "TN10 3TX")
        & (survey_list[number_col] == "18A"),
    )

    replace_in_street(survey_list, "Brindley House, Wellbeck Road", "Brindley House")

    # Take just the first part of the string, splitting on a /
    keep_first_part(survey_list, "/")
    replace_in_street(survey_list, "HUNTSMAN WAY", "HUNTSMANS WAY")

    # Take just the first part of the string, splitting on a ,
    keep_first_part(survey_list, ",")
    replace_in_street(survey_list, "McCLAREN COURT", "MCLAREN COURT")
    replace_in_street(survey_list, "ST JAMES CLOISTERS", "ST. JAMES'S CLOISTERS")

    # Flats recorded as "FLAT <n> <building>" on MELTON ROAD get the building number moved
    # into the street name ("22 MELTON ROAD" / "24 MELTON ROAD")
    for building in ("22", "24"):
        flats = [f"FLAT {flat} {building}" for flat in range(1, 7)]
        in_building = (
            survey_list[number_col].isin(flats)
            & (survey_list[street_col] == "MELTON ROAD")
        )
        survey_list[street_col] = np.where(
            in_building, f"{building} MELTON ROAD", survey_list[street_col]
        )

    replace_in_street(survey_list, "TURRETT GREEN COURT SILENT STREET", "TURRET GREEN COURT")

    # These addresses don't exist in the asset list:
    # flat 1 Turret Green Court; 3, 45 Raywell Street; 40 Avondale Drive
    for street, number in (
        ("TURRET GREEN COURT", 1),
        ("45 RAYWELL STREET", 3),
        ("Avondale Drive", 40),
    ):
        survey_list = drop_rows(
            survey_list,
            (survey_list[street_col] == street) & (survey_list[number_col] == number),
        )

    # 17A beech road has the wrong postcode. The original applied this correction twice;
    # the second pass could never match anything, so it is applied once here.
    wrong_postcode = (
        (survey_list[street_col] == "BEECH ROAD")
        & (survey_list[postcode_col] == "DH6 1JD")
    )
    survey_list[postcode_col] = np.where(wrong_postcode, "DH6 1JB", survey_list[postcode_col])

    joined_name = (
        (survey_list[street_col] == "SOUTHVIEW")
        & (survey_list[postcode_col] == "DL16 7DF")
    )
    survey_list[street_col] = np.where(joined_name, "SOUTH VIEW", survey_list[street_col])

    return survey_list
@staticmethod
def correct_ha32_survey_list(survey_list):
    """Rename HA32 survey streets whose whole value matches a known wrong entry.

    Only exact, full-value matches are rewritten; the column is overwritten on the frame
    that was passed in.

    :param survey_list: survey DataFrame with a "Street / Block Name" column
    :return: the same DataFrame with the street names rewritten
    """
    street_col = "Street / Block Name"

    exact_renames = (
        ("Coxwold", "Coxwold Grove"),
        # Correct spelling is Barrington Avenue
        ("Barringhton Avenue", "Barrington Avenue"),
        # How the Rustenburg addresses are listed in the identified addresses
        ("Rustenburg", "Rustenburg Street"),
        # How the MALIN LODGE, RONALDSWAY CLOSE addresses are listed
        ("MALIN LODGE, RONALDSWAY CLOSE", "Malin Lodge"),
        # How the Faroes Close addresses are listed
        ("Feroes Close", "Faroes Close"),
        # NOTE(review): maps a value onto itself, so it changes nothing - confirm what the
        # intended target spelling was
        ('FORESTER WAY', 'FORESTER WAY'),
        ('6 Zeigfeld', 'Ziegfeld Court'),
        # Mixed-case variant of the Malin Lodge entry above
        ('Malin Lodge, Ronaldsway Close', 'Malin Lodge'),
    )

    for recorded, wanted in exact_renames:
        current = survey_list[street_col]
        survey_list[street_col] = np.where(current == recorded, wanted, current)

    return survey_list
@staticmethod
def correct_ha50_survey_list(survey_list):
    """Clean the HA50 survey list.

    Applies, in the original order, postcode fixes, street-name corrections and removals of
    individual survey rows, then drops duplicates that only differ by case or spaces in the
    street name / postcode.

    All string replacements are literal (``regex=False``); several patterns contain "."
    ("St. James Place", "St.Bernards Place", ...) which would otherwise act as a wildcard
    when pandas treats the pattern as a regular expression. Filtered frames are copied so
    later column assignments never write to a slice.

    :param survey_list: survey DataFrame with "NO.", "Street / Block Name" and "Post Code"
        columns
    :return: the corrected, de-duplicated DataFrame
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"
    number_col = "NO."

    def fix_streets(frame, pairs):
        # Literal substring replacements, applied in the given order
        for wrong, right in pairs:
            frame[street_col] = frame[street_col].str.replace(wrong, right, regex=False)

    def fix_postcode(frame, street, wrong, right):
        # Replace a postcode only on the rows of one specific street
        to_fix = (frame[street_col] == street) & (frame[postcode_col] == wrong)
        frame[postcode_col] = np.where(to_fix, right, frame[postcode_col])

    def drop_address(frame, street, postcode, numbers):
        unwanted = (
            (frame[street_col] == street)
            & (frame[postcode_col] == postcode)
            & (frame[number_col].isin(numbers))
        )
        return frame[~unwanted].copy()

    fix_postcode(survey_list, "COSELEY STREET", "ST16 1LR", "ST6 1JU")
    # Remove some of COSELEY STREET, as we have surveys done outside of the asset list
    survey_list = drop_address(survey_list, "COSELEY STREET", "ST6 1JU", [96])

    survey_list[postcode_col] = survey_list[postcode_col].str.replace(
        "ST33JZ", "ST3 3JZ", regex=False
    )
    # Remove some of Jesmond Drive, as we have surveys done outside of the asset list
    survey_list = drop_address(survey_list, "Jesmond Drive", "ST3 3JZ", [29])

    fix_streets(survey_list, [("BRUNDELL OVAL", "BRUNDALL OVAL")])

    # Remove 4 Linden Place and 11 Tilehurst Place
    survey_list = drop_address(survey_list, "Linden Place", "ST3 3AT", [4])
    survey_list = drop_address(survey_list, "Tilehurst Place", "ST3 3AP", [11])

    fix_streets(survey_list, [
        ("deavile road", "DEAVILLE ROAD"),
        ("WOOLISCROFT ROAD", "WOOLLISCROFT ROAD"),
        ("Leak Road", "Leek Road"),
        ("Springfield road", "Springfields road"),
        ("MILLWARD RD", "MILLWARD ROAD"),
        ("REPINGTON RD", "REPINGTON ROAD"),
        ("ECCELSTONE PLACE", "ECCLESTONE PLACE"),
        ("St. James Place", "St James Place"),
        ("CHELL HEATH RD", "CHELL HEATH ROAD"),
    ])

    # Correct postcode (must run after the street has been expanded to "... ROAD")
    fix_postcode(survey_list, "CHELL HEATH ROAD", "ST6 6HU", "ST6 6HJ")

    fix_streets(survey_list, [
        ("Franklin Rd", "Franklin Road"),
        ("Lodge Rd", "Lodge Road"),
        ("St Matthews Street", "St Matthew Street"),
        ("Grove Bank Road", "Grovebank Road"),
        ("OVERSLEY RD", "OVERSLEY ROAD"),
        # Replace all of the remaining " RD" with " ROAD"
        (" RD", " ROAD"),
        ("St. Georges Crescent", "St Georges Crescent"),
        ("Tewson Road", "Tewson Green"),
    ])

    # Remove 55 Seabridge Lane and 56 Tyne Way
    survey_list = drop_address(survey_list, "Seabridge Lane", "ST5 4AG", [55])
    survey_list = drop_address(survey_list, "Tyne Way", "ST5 4AX", [56])

    fix_streets(survey_list, [
        ("St.Bernards Place", "St Bernard Place"),
        ("Penarth Road", "Penarth Grove"),
        ("St. Marys Road", "St Marys Road"),
        ("Larch Drive", "Larch Grove"),
    ])

    # Drop 31 Lauder Place North, as there is a duplicate. This version also has a wrong
    # postcode
    survey_list = drop_address(survey_list, "LAUDER PLACE NORTH", "ST20QS", [31])

    # Handle dropping of dupes: compare street and postcode case-insensitively and without
    # spaces, via two temporary helper columns
    survey_list["street_pruner"] = survey_list[street_col].str.lower().str.replace(
        " ", "", regex=False
    )
    survey_list["postcode_pruner"] = survey_list[postcode_col].str.lower().str.replace(
        " ", "", regex=False
    )
    survey_list = survey_list.drop_duplicates([number_col, "street_pruner", "postcode_pruner"])
    survey_list = survey_list.drop(columns=["street_pruner", "postcode_pruner"])

    return survey_list
@staticmethod
def correct_ha107_survey_list(survey_list):
    """Correct misspelt or badly punctuated street names in the HA107 survey list.

    The "Street / Block Name" column is overwritten on the frame that was passed in.

    :param survey_list: survey DataFrame with a "Street / Block Name" column
    :return: the same DataFrame with corrected street names
    """
    street_col = "Street / Block Name"

    corrections = [
        # Wrong village name
        ("Front Street, East Stockham", "Front Street, East Stockwith"),
        # Typo in the street name
        ("HONEYHOLE L;ANE", "HONEYHOLES LANE"),
        # Missing comma between the street and the village
        ("Croft Lane Cherry Willingham, Lincoln", "Croft Lane, Cherry Willingham, Lincoln"),
        ("Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"),
        ("Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"),
        ("Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"),
        ("Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"),
        ("Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"),
        ("Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"),
        ("Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"),
        # Typo in the street name
        ("SPRINKHILL ROAD", "SPINKHILL ROAD"),
    ]

    streets = survey_list[street_col]
    for recorded, corrected in corrections:
        streets = streets.str.replace(recorded, corrected)
    survey_list[street_col] = streets

    return survey_list


@staticmethod
def correct_ha41_survey_list(survey_list):
    """Return the HA41 survey list unchanged; no corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha12_survey_list(survey_list):
    """Correct street names and unspaced postcodes in the HA12 survey list.

    Replacements are literal substring replacements (``regex=False``) and the columns are
    overwritten on the frame that was passed in.

    :param survey_list: survey DataFrame with "Street / Block Name" and "Post Code" columns
    :return: the same DataFrame with the corrections applied
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"

    for wrong, right in (
        ("Henstone Road", "Hanstone Road"),
        ("Lindern avenue", "Linden Avenue"),
        ("priness way", "Princess Way"),
        ("Worth Crecesent", "Worth Crescent"),
        ("Adderbrook Crescent", "Addenbrooke Crescent"),
        ("Kinver Road", "Kinver Avenue"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right, regex=False)

    # Postcodes typed without the separating space
    for wrong, right in (("DY117HA", "DY11 7HA"), ("DY117HF", "DY11 7HF")):
        survey_list[postcode_col] = survey_list[postcode_col].str.replace(
            wrong, right, regex=False
        )

    return survey_list


@staticmethod
def correct_ha13_survey_list(survey_list):
    """Correct street names and one postcode in the HA13 survey list, and drop a duplicate.

    Replacements are literal (``regex=False``). This matters for the postcode "HP2 5SF+":
    read as a regular expression, "F+" means "one or more F", which matches "HP2 5SF" and
    leaves the stray "+" in place.

    :param survey_list: survey DataFrame with "NO.", "Street / Block Name", "Post Code" and
        "INSTALLED OR CANCELLED" columns
    :return: the corrected DataFrame without the duplicated Turners Hill record
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"

    for wrong, right in (
        ("Woodfarm Road", "WOOD FARM ROAD"),
        ("ALLANDALE ROAD", "ALLANDALE"),
        ("NEWFIELDS LANE", "NEWFIELD LANE"),
        ("BROADFIELDS ROAD", "BROADFIELD ROAD"),
        ("PESCOTT HILL", "PESCOT HILL"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right, regex=False)

    # Strip the stray "+" typed after this postcode
    survey_list[postcode_col] = survey_list[postcode_col].str.replace(
        "HP2 5SF+", "HP2 5SF", regex=False
    )

    # This is a duplicate record
    duplicate_record = (
        (survey_list["NO."] == 33)
        & (survey_list[street_col] == "Turners Hill")
        & (survey_list[postcode_col] == "HP2 4LH")
        & (survey_list["INSTALLED OR CANCELLED"] == "NO UPDATE - CHECKED 18.12.23")
    )
    return survey_list[~duplicate_record]


@staticmethod
def correct_ha18_survey_list(survey_list):
    """Return the HA18 survey list unchanged; no corrections are defined for it."""
    return survey_list


@staticmethod
def correct_ha35_survey_list(survey_list):
    """Return the HA35 survey list unchanged; no corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha34_survey_list(survey_list):
    """Clean the HA34 survey list.

    Drops rows for one postcode and for individual addresses the original comments
    describe as missing from the asset sheet, corrects street names and one postcode,
    removes duplicates and normalises the house number "187 A".

    :param survey_list: survey DataFrame with "NO.", "Street / Block Name" and "Post Code"
        columns
    :return: the filtered and corrected DataFrame
    """
    street_col = "Street / Block Name"
    number_col = "NO."
    postcode_col = "Post Code"

    def rename_streets(frame, pairs):
        # Substring replacements on the street column, applied in the given order
        for recorded, wanted in pairs:
            frame[street_col] = frame[street_col].str.replace(recorded, wanted)

    def without_address(frame, street, number):
        # Drop the rows for one house number on one street
        missing_from_assets = (frame[street_col] == street) & (frame[number_col] == number)
        return frame[~missing_from_assets]

    # Not in the asset list
    survey_list = survey_list[survey_list[postcode_col] != "L5 3SS"]

    survey_list[postcode_col] = survey_list[postcode_col].str.replace("L177DR", "L17 7DR")

    rename_streets(survey_list, [
        ("PENVALLEY CRESENT", "Penvalley Crescent"),
        ("PENLINKEN DRIVE", "Penlinken Drive"),
    ])

    # There's no 32 Penlinken Drive and no 30 Gwent Street in the asset sheet
    survey_list = without_address(survey_list, "Penlinken Drive", 32)
    survey_list = without_address(survey_list, "GWENT ST", 30)

    rename_streets(survey_list, [
        ("POULTON RD", "Poulton Road"),
        ("ST PAULS RD", "St Pauls Road"),
        ("BROAD LANE, KIRKBY", "BROAD LANE"),
        ("BULLENS RD, KIRKBY", "Bullens Road"),
    ])

    # There's no 219 NORTH HILL ST in the asset sheet
    survey_list = without_address(survey_list, "NORTH HILL ST", 219)

    rename_streets(survey_list, [
        ("CROSLAND RD, KIRKBY", "CROSLAND ROAD"),
        ("PARK BROW DRIVE, KIRKBY", "Park Brow Drive"),
        ("CELTIC TREET", "Celtic Street"),
        ("BUCKLAND ROAD", "Buckland Street"),
    ])

    # Duplicates
    survey_list = survey_list.drop_duplicates([street_col, number_col, postcode_col])

    # This is a duplicate with the wrong postcode
    wrong_postcode_duplicate = (
        (survey_list[street_col] == "CLARIBEL STREET")
        & (survey_list[number_col] == 7)
        & (survey_list[postcode_col] == "L8 8AF")
    )
    survey_list = survey_list[~wrong_postcode_duplicate]

    # "187 A" is written without the space for this postcode
    spaced_number = (survey_list[number_col] == "187 A") & (survey_list[postcode_col] == "L32 6QF")
    survey_list[number_col] = np.where(spaced_number, "187A", survey_list[number_col])

    return survey_list
@staticmethod
def correct_ha56_survey_list(survey_list):
    """Clean the HA56 survey list: drop three specific rows and expand three "RD" streets.

    :param survey_list: survey DataFrame with "NO.", "Street / Block Name" and "Post Code"
        columns
    :return: the filtered DataFrame with corrected street names
    """
    street_col = "Street / Block Name"
    number_col = "NO."
    postcode_col = "Post Code"

    # Not in asset list
    not_in_assets = (
        (survey_list[street_col] == "Samual Street")
        & (survey_list[number_col].isin([22, 24]))
        & (survey_list[postcode_col] == "WA5 1BB")
    )
    survey_list = survey_list[~not_in_assets]

    abbreviations = {
        "STOURTON RD": "Stourton Road",
        "BIRKIN RD": "Birkin Road",
        "PORTLAND RD": "Portland Road",
    }
    for abbreviated, full_name in abbreviations.items():
        survey_list[street_col] = survey_list[street_col].str.replace(abbreviated, full_name)

    # We remove a row, because two rows match to a block listing
    block_listing_clash = (
        (survey_list[street_col] == "Tavlin Avenue")
        & (survey_list[number_col] == 17)
        & (survey_list[postcode_col] == "WA5 0EN")
    )
    return survey_list[~block_listing_clash]
@staticmethod
def correct_ha30_survey_list(survey_list):
    """Clean the HA30 survey list.

    Rows without a postcode are dropped, street names are cut at the first "/", a set of
    individual addresses is removed and several street names are rewritten. The order of
    the steps is significant: some removals match the street name only after it has been
    rewritten.

    :param survey_list: survey DataFrame with "NO.", "Street / Block Name" and "Post Code"
        columns
    :return: the filtered and corrected DataFrame
    """
    street_col = "Street / Block Name"
    number_col = "NO."

    def rename(frame, recorded, wanted):
        frame[street_col] = frame[street_col].str.replace(recorded, wanted)

    def drop_numbers(frame, street, numbers):
        unwanted = (frame[street_col] == street) & (frame[number_col].isin(numbers))
        return frame[~unwanted]

    has_postcode = ~pd.isnull(survey_list["Post Code"])
    survey_list = survey_list[has_postcode]

    # Split on / and take the first half
    survey_list[street_col] = survey_list[street_col].str.split("/").str[0]

    # Not in the asset list
    for street, number in (
        ("Horsebridge Road", 286),
        ("DUTTON WAY", 9),
        ("PAYTHORNE CLOSE", 10),
        ("MARCHWOOD ROAD", 11),
        ("Otterburn Close", 4),
        ("Blossom Court", 5),
    ):
        unwanted = (survey_list[street_col] == street) & (survey_list[number_col] == number)
        survey_list = survey_list[~unwanted]

    rename(survey_list, "St LUKES CLOSE , HUNTINGDON", "St. Lukes Close")
    survey_list = drop_numbers(survey_list, "St. Lukes Close", [4, 7, 8])

    rename(survey_list, "ROMAN WAY , GODMANCHESTER , HUNTINGDON", "Roman Way")
    survey_list = drop_numbers(survey_list, "Roman Way", [58])

    rename(survey_list, "HEADLANDS , FENSTANTON , HUNTINGDON", "Headlands Fenstanton")
    survey_list = drop_numbers(survey_list, "Headlands Fenstanton", [126, 134])

    rename(survey_list, "WALLACE COURT , HUNTINGDON", "Wallace Court")
    rename(survey_list, "CRICKETERS WAY , CHATTERIS", "Cricketers Way")
    rename(survey_list, "Jubilee Gardens", "Jubilee Green")

    survey_list = drop_numbers(survey_list, "Harrow Road", [10])

    # Runs after the removals above, so rows renamed here are never dropped
    rename(survey_list, "ST LUKES CLOSE", "St. Lukes Close")

    return survey_list


@staticmethod
def correct_ha49_survey_list(survey_list):
    """Return the HA49 survey list unchanged; no corrections are defined for it."""
    return survey_list


@staticmethod
def correct_ha8_survey_list(survey_list):
    """Clean the HA8 survey list: trim street names, fix three names, drop placeholder rows.

    :param survey_list: survey DataFrame with "Street / Block Name" and "Post Code" columns;
        the street column is overwritten on the frame that was passed in
    :return: the rows that have a postcode, with corrected street names
    """
    street_col = "Street / Block Name"

    # Split on / and take the first half
    streets = survey_list[street_col].str.split("/").str[0]
    for recorded, wanted in (
        ("WESTONIA COURT HOUSE", "Westonia Court"),
        ("Hillesdon Avenue", "Hillesden Avenue"),
        ("Weston Street", "Western Street"),
    ):
        streets = streets.str.replace(recorded, wanted)
    survey_list[street_col] = streets

    # Remove placeholder rows where postcode is missing
    postcode_missing = pd.isnull(survey_list["Post Code"])
    return survey_list[~postcode_missing]


@staticmethod
def correct_ha11_survey_list(survey_list):
    """Drop 39 HOLLYWOOD WAY from the HA11 survey list.

    :param survey_list: survey DataFrame with "NO." and "Street / Block Name" columns
    :return: the DataFrame without that address
    """
    # Remove 39 HOLLYWOOD WAY as it's not in the asset list
    not_in_assets = (
        (survey_list["Street / Block Name"] == "HOLLYWOOD WAY")
        & (survey_list["NO."] == 39)
    )
    return survey_list[~not_in_assets]


@staticmethod
def correct_ha42_survey_list(survey_list):
    """Blank out two street names in the HA42 survey list.

    :param survey_list: survey DataFrame with a "Street / Block Name" column
    :return: the same DataFrame with those street names removed from the column
    """
    street_col = "Street / Block Name"

    # Original asset list has nothing in the street
    for street in ("Turnstone Terrace", "Pegasus place"):
        survey_list[street_col] = survey_list[street_col].str.replace(street, "")

    return survey_list


@staticmethod
def correct_ha45_survey_list(survey_list):
    """Rename "Norwich Road" to "Norwich Avenue" in the HA45 survey list.

    :param survey_list: survey DataFrame with a "Street / Block Name" column
    :return: the same DataFrame with the street name corrected
    """
    street_col = "Street / Block Name"
    corrected = survey_list[street_col].str.replace("Norwich Road", "Norwich Avenue")
    survey_list[street_col] = corrected
    return survey_list


@staticmethod
def correct_ha51_survey_list(survey_list):
    """Rename the "NO " column to "NO." and fix "Autum Close" in the HA51 survey list.

    :param survey_list: survey DataFrame with "NO " and "Street / Block Name" columns
    :return: a renamed copy of the frame with the street name corrected
    """
    street_col = "Street / Block Name"
    survey_list = survey_list.rename(columns={"NO ": "NO."})
    corrected = survey_list[street_col].str.replace("Autum Close", "Autumn Close")
    survey_list[street_col] = corrected
    return survey_list
@staticmethod
def correct_ha52_survey_list(survey_list):
    """Correct three street names in the HA52 survey list and drop duplicated rows.

    :param survey_list: survey DataFrame with "NO.", "Street / Block Name" and "Post Code"
        columns; the street column is overwritten on the frame that was passed in
    :return: the de-duplicated DataFrame
    """
    street_col = "Street / Block Name"

    streets = survey_list[street_col]
    for recorded, wanted in (
        ("Mardalle Avenue", "Mardale Avenue"),
        ("Ollerton Close, Grappenhall", "Ollerton Close"),
        ("Bradshaw Road, Grappenhall", "Bradshaw Lane"),
    ):
        streets = streets.str.replace(recorded, wanted)
    survey_list[street_col] = streets

    # Drop a bunch of dupes
    return survey_list.drop_duplicates(["NO.", street_col, "Post Code"])


@staticmethod
def correct_ha5_survey_list(survey_list):
    """Return the HA5 survey list unchanged; no corrections are defined for it."""
    return survey_list


@staticmethod
def correct_ha20_survey_list(survey_list):
    """Rewrite six street names in the HA20 survey list.

    :param survey_list: survey DataFrame with a "Street / Block Name" column; the column is
        overwritten on the frame that was passed in
    :return: the same DataFrame with the street names rewritten
    """
    street_col = "Street / Block Name"

    rewrites = {
        "Abbot Close": "ABBOTS CLOSE",
        "Downbarns Road": "DOWN BARNS ROAD",
        "Austin Lane": "AUSTINS LANE",
        "South Park Way": "SOUTHPARK WAY",
        "OAKLAND ROAD": "OAKWOOD ROAD",
        "ACRE WAY/NORTHWOOD": "ACRE WAY",
    }
    streets = survey_list[street_col]
    for recorded, wanted in rewrites.items():
        streets = streets.str.replace(recorded, wanted)
    survey_list[street_col] = streets

    return survey_list


@staticmethod
def levenstein_match(matching_string, df):
    """Return the single row of ``df`` whose address is the closest fuzzy match.

    Each "matching_address" value is stripped of punctuation and spaces and compared with
    ``matching_string`` using ``fuzz.ratio``; the first row with the smallest distance
    (100 - ratio) wins. No distance threshold is applied, so the best candidate is returned
    however poor the match is.

    :param matching_string: the key to match (callers pass it without spaces)
    :param df: candidate rows with a "matching_address" column
    :return: a one-row DataFrame slice of ``df``
    """
    # Strip out punctuation and spaces
    candidates = [
        re.sub(r'[^\w\s]', '', address).replace(" ", "")
        for address in df["matching_address"].tolist()
    ]

    # Distance between the full key and every candidate
    distances = [100 - fuzz.ratio(matching_string, candidate) for candidate in candidates]

    # Position of the first candidate with the smallest distance
    best_position = min(range(len(distances)), key=distances.__getitem__)

    return df.iloc[best_position:best_position + 1]
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
    """Match every survey row to one asset row and attach the asset row id.

    The survey list is first passed through the ``correct_<ha_name>_survey_list`` method of
    this object. Each survey row is then narrowed down against ``asset_list`` in stages:
    street name contained in "matching_address", house number contained in
    "matching_address", exact "HouseNo", postcode, and finally a fuzzy match on the
    concatenated address key.

    NOTE(review): this block was recovered from text that had lost its indentation. The
    nesting of the "flat" handling and of the house-number filter below is inferred from
    the surrounding comments - confirm against the original file.

    :param asset_list: DataFrame with "matching_address", "matching_postcode", "HouseNo"
        and "asset_list_row_id" columns
    :param survey_list: DataFrame with "NO.", "Street / Block Name", "Post Code" and
        "survey_list_row_id" columns, and optionally "Town/Area"
    :param ha_name: housing association key, e.g. "HA34"; selects the correction method and
        the postcodes that are allowed to stay unmatched
    :return: the corrected survey list left-merged with the matched "asset_list_row_id"
    :raises AttributeError: if no ``correct_<ha_name>_survey_list`` method exists
    :raises ValueError: if a row cannot be narrowed down to exactly one asset, if the
        lookup does not have one entry per survey row, or if two survey rows match the
        same asset
    """
    # Correct the survey list
    survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list")
    survey_list = survey_list_correction_function(survey_list)

    # Postcodes for which an unmatched survey row is tolerated (recorded with no asset id)
    # instead of raising
    missed_postcodes = []
    if ha_name in ["HA6", "HA34"]:
        # Every survey postcode that does not appear in the asset list at all
        missed_postcodes = [
            postcode.lower() for postcode in survey_list["Post Code"] if
            postcode.lower() not in asset_list["matching_postcode"].values
        ]

    if ha_name == "HA13":
        missed_postcodes = ["hp17 8le"]

    if ha_name == "HA56":
        # Multiple properties are listed as blocks, which is a problem for matching
        missed_postcodes = ["sk17 6nr", "wa5 0en"]

    matching_lookup = []
    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
        house_number = row["NO."]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        # Filter on the first line of the address.
        # NOTE: str.contains treats the street name as a regular expression here.
        df = asset_list[
            asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip())
        ].copy()

        # No candidate contains the house number as written: try to reduce entries such as
        # "flat x, y" or "flat x y" to just "x"
        if not any(df["matching_address"].str.contains(str(house_number))):
            if "flat" in str(house_number):
                house_number = house_number.split("flat")[1].strip()
                # We check if we had an instance of flat x, y
                if "," in str(house_number):
                    house_number = house_number.split(",")[0].strip()
                # We may also have a space for an instance of flat x y
                if " " in str(house_number):
                    house_number = house_number.split(" ")[0].strip()

        # Keep the candidates whose address contains the (possibly reduced) house number
        df = df[df["matching_address"].str.contains(str(house_number))]

        if df.empty:
            postcode_lower = row["Post Code"].lower()
            if postcode_lower in missed_postcodes:
                # Known gap: record the survey row without an asset match
                matching_lookup.append(
                    {
                        "survey_list_row_id": row["survey_list_row_id"],
                        "asset_list_row_id": None,
                    }
                )
                continue
            # Print the unmatched address before failing
            print(row["Street / Block Name"])
            print(house_number)
            print(row["Post Code"])
            raise ValueError("Investigate")

        # Several candidates left: require an exact house number
        if df.shape[0] != 1:
            df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]

        # Still not unique: narrow down on the postcode
        if df.shape[0] != 1:
            df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]
            if df.empty:
                postcode_lower = row["Post Code"].lower()
                if postcode_lower in missed_postcodes:
                    matching_lookup.append(
                        {
                            "survey_list_row_id": row["survey_list_row_id"],
                            "asset_list_row_id": None,
                        }
                    )
                    continue

        # Last resort: pick the closest candidate by fuzzy matching on the whole address
        if df.shape[0] != 1:
            if "Town/Area" not in row.keys():
                full_key = (str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() +
                            row["Post Code"].lower().strip())
            else:
                full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + \
                    row["Town/Area"].lower().strip() + row["Post Code"].lower().strip()
            # Remove any spaces from the full key
            full_key = full_key.replace(" ", "")
            df = self.levenstein_match(full_key, df)

        if df.shape[0] != 1:
            print(row["Street / Block Name"])
            print(house_number)
            print(row["Post Code"])
            raise ValueError("Investigate")

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    # Every survey row must have produced exactly one lookup entry (matched or not)
    if matching_lookup.shape[0] != survey_list.shape[0]:
        raise ValueError("Mismatch in the number of survey rows and matching lookup rows")

    # Only the rows that were actually matched take part in the duplicate check and merge
    matching_lookup = matching_lookup[~pd.isnull(matching_lookup["asset_list_row_id"])]

    if matching_lookup["asset_list_row_id"].duplicated().sum():
        raise ValueError("Duplicated matches in survey list")

    # Merge onto the survey list
    survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")

    # TEMP FOR NEWER WORK
    # matching_lookup = matching_lookup.merge(
    #     asset_list[["asset_list_row_id", "UPRN"]], how="left", on="asset_list_row_id"
    # ).merge(
    #     survey_list[["survey_list_row_id", "NO.", "Street / Block Name", "Post Code"]],
    #     how="left", on="survey_list_row_id"
    # )
    # matching_lookup.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane/surveys_to_assets.csv"
    # )

    return survey_list
eco3_list[ ~pd.isnull(eco3_list["Post Code"]) ] # We have a bunch of genuine duplicates eco3_list = eco3_list.drop_duplicates(["NO ", "Street / Block Name", "Post Code"]) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "HALWILL MEADOOW", "HALWILL MEADOW" ) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "Hall Road", "Hall Rd" ) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY" ) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "BOND SPEAR COURT", "BOND-SPEAR COURT" ) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "ST.MARYS HILL", "ST MARYS HILL" ) # Correct the postcode for edmund road eco3_list["Post Code"] = np.where( (eco3_list["Street / Block Name"] == "EDMUND ROAD") & (eco3_list["Post Code"] == "TR14 8QJ"), "TR15 1BY", eco3_list["Post Code"] ) return eco3_list @staticmethod def correct_ha50_eco3_list(eco3_list): return eco3_list @staticmethod def correct_ha41_eco3_list(eco3_list): return eco3_list @staticmethod def correct_ha63_eco3_list(eco3_list): eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])] # Some postcode that aren't in the asset list eco3_list = eco3_list[ ~eco3_list["Post Code"].isin( ["NR32 15X", "NR30 2BT"] ) ] eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "POUND COTTAGES - BLOOMSBERRY CLOSE", "POUND COTTAGES" ) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "FREDRICK ROAD", "Frederick Road" ) # For denmark street, remove the space from the house number eco3_list["NO "] = np.where( eco3_list["Street / Block Name"] == "DENMARK STREET", eco3_list["NO "].str.replace(" ", ""), eco3_list["NO "] ) eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( "OLD HOSPITAL MEWS HOSPITAL WALK", "Old Hospital Mews" ) eco3_list["Street / Block Name"] 
@staticmethod
def correct_ha56_eco3_list(eco3_list):
    """
    Apply the manual corrections needed before matching the HA56 ECO3 list to the asset list.

    Rows without a postcode are removed, known street name and postcode errors are corrected, and one
    known duplicate (a cancelled entry) is dropped. The frame passed in is not modified.

    :param eco3_list: DataFrame with (at least) the "NO ", "Street / Block Name", "Post Code" and
        "INSTALL/ CANCELLATION DATE" columns
    :return: A new DataFrame holding the corrected ECO3 list
    """
    # The explicit copy means the column assignments below write to an independent frame rather than to
    # a slice of the caller's frame (no SettingWithCopyWarning)
    eco3_list = eco3_list[eco3_list["Post Code"].notna()].copy()

    street_corrections = (
        ("Mount Pleasant, Crewe", "Mount Pleasant"),
        ("Dutton Close", "Dutton Way"),
    )
    for wrong, right in street_corrections:
        # regex=False: the corrections are literal strings, independent of the pandas default
        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
            wrong, right, regex=False
        )

    eco3_list["Post Code"] = eco3_list["Post Code"].str.replace("Ls63nl", "LS6 3NL", regex=False)

    # Handle a duplicate
    is_known_duplicate = (
        (eco3_list["Street / Block Name"] == "Mount Pleasant")
        & (eco3_list["Post Code"] == "CW1 3JF")
        & (eco3_list["NO "] == 5)
        & (eco3_list["INSTALL/ CANCELLATION DATE"] == "CANCELLED 20.5.2022")
    )
    return eco3_list[~is_known_duplicate]
(eco3_list["NO "] == "FLAT 3, 11")) ] eco3_list["NO "] = np.where( (eco3_list["NO "] == "47 B"), "47B", eco3_list["NO "] ) return eco3_list def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") eco3_list = eco3_list_correction_function(eco3_list) asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower() eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "") if ha_name in ["HA25", "HA56", "HA51"]: # HA25: 317 -> 259 missed_postcodes = { postcode for postcode in eco3_list["postcode_no_space"] if postcode not in asset_list["matching_postcode_nospace"].values } eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)] # For the asset list, we create a matching address without any punctuation # TODO: We should generally just remove puncutation from addresses when matching asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace( r'[^\w\s]', '', regex=True ) # Remove double spaces asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace( " ", " " ) matching_lookup = [] missed = [] for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): # if row["eco3_list_row_id"] == "HA51_Eco3_22": # raise Exception() postcode = row["postcode_no_space"] # df will never be empty, since we've already done a check for common postcodes df = asset_list[ asset_list["matching_postcode_nospace"].str.contains(postcode) ] house_number = row["NO "] if isinstance(house_number, str): house_number = house_number.lower().strip() if not any(df["HouseNo"].str.contains(str(house_number))): if "flat" in str(house_number): house_number = house_number.split("flat")[1].strip() # We check if we had an instance of flat x, y if "," in str(house_number): house_number = house_number.split(",")[0].strip() # We may also have a 
space for an instance of flat x y if " " in str(house_number): house_number = house_number.split(" ")[0].strip() # We must do the house number filter df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] # Perform a search on streetname # We do this to prevent duplicate matches to properties with the same postcode and house number, # but different streets street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0] street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1) df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)] if df.empty: missed.append(row["eco3_list_row_id"]) continue if df.shape[0] > 1: if "flat" in str(row["NO "]).lower(): df = df[df["matching_address"].str.contains("flat")] else: df = df[~df["matching_address"].str.contains("flat")] if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) print(row["Post Code"]) raise ValueError("Investigate") matching_lookup.append( { "eco3_list_row_id": row["eco3_list_row_id"], "asset_list_row_id": df["asset_list_row_id"].values[0], } ) # We verify the missed # HA25 contains 119 missed entries. 
@staticmethod
def extract_streetname(address, house_number=None, postcode=None):
    """
    Cleans an address by removing the house number and postcode, and converts everything to lower case.
    Only the first non-empty comma separated section of what remains is returned.

    :param address: The full address as a string.
    :param house_number: The house number to remove, as a string or integer.
    :param postcode: The postcode to remove, as a string.
    :return: The cleaned, lower case street section, or an empty string if nothing is left.
    """
    # Convert everything to lower case
    address = address.lower()

    # Remove the house number, then the postcode
    for token in (house_number, postcode):
        if token is None:
            continue
        # The token is escaped so characters such as "(", "+" or "." are matched literally rather than
        # being interpreted as regex syntax. Lookarounds are used in place of \b so that a token which
        # starts or ends with punctuation, e.g. "2(a)", is still only matched as a whole token
        pattern = r'(?<!\w){}(?!\w)'.format(re.escape(str(token)))
        address = re.sub(pattern, '', address, flags=re.IGNORECASE).strip()

    # Take the first section that still has content. Removing a house number that was followed by a
    # comma ("12, high street") leaves an empty leading section, which must be skipped
    for section in address.split(","):
        # Replace multiple spaces with a single space and drop leading/trailing spaces
        section = re.sub(r'\s+', ' ', section).strip()
        if section:
            return section

    return ""
@staticmethod
def identify_built_form_ha6(property_string):
    """
    Identify the built form of a property from the given string.

    Matching is case-insensitive and treats hyphens, underscores and repeated whitespace as a single
    space, so "Semi-Detached" and "semi  detached" are both recognised as semi detached.

    :param property_string: The string describing the property
    :return: The identified built form, or None if it cannot be identified (including when the value
        is not a string, e.g. a missing value)
    """
    if not isinstance(property_string, str):
        return None

    # Define keywords for each built form. Order matters: "semi detached" must be tested before
    # "detached", since every semi detached description also contains the word "detached"
    built_forms = (
        ('Semi-Detached', ('semi detached',)),
        ('Detached', ('detached',)),
        ('Mid-Terrace', ('mid terrace', 'mid town house')),
        ('End-Terrace', ('end terrace', 'end town house')),
    )

    # Normalize the input: lower case, with hyphens/underscores/whitespace runs collapsed to one space.
    # Without this, "semi-detached" misses the semi detached keyword and is reported as "Detached"
    property_string_normalized = re.sub(r'[\s\-_]+', ' ', property_string.lower()).strip()

    # Search for each built form keyword in the input string
    for built_form, keywords in built_forms:
        if any(keyword in property_string_normalized for keyword in keywords):
            return built_form

    # Return None if no built form is identified
    return None
scheme_map = { "ECO4": "ECO4", "AFFORDABLE WARMTH": "ECO4", "ECO4 A/W": "ECO4", "ECO4 GBIS (ECO+)": "GBIS", "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS", "ECO4 AFFORDABLE WARMTH": "ECO4", "Affordable Warmth": "ECO4", "ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS", "ECO4 PPS": "ECO4", "AFFORDABLE WARMTH / REMEDIAL": "ECO4", "AFF0RDALE WARMTH": "ECO4", "ECO 4 RdSAP CL": "ECO4", "Affordable Warmth (R) ": "ECO4", "Affordable Warmth ": "ECO4", "ECO 4 AFFORDABLE WARMTH": "ECO4", } # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we # treat these as similar to subject to CIGA, and therefore unconfirmed worked that could fail. There # are only a small volume of properties for which we see this eco_eligibility_map = { "not eligble": "not eligible", "eco 4(subject to ciga)": "eco4 (subject to ciga)", "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to archetype check)": "eco4 (subject to archetype)", "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to ciga)": "eco4 (subject to ciga)", "eco4(subject to ciga)": "eco4 (subject to ciga)", "eco4 subject to ciga": "eco4 (subject to ciga)", "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)", "eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)", } ha_facts_and_figures = [] for ha_name, data_assets in self.data.items(): asset_list = data_assets["asset_list"].copy() survey_list = data_assets["survey_list"].copy() ciga_list = data_assets["ciga_list"].copy() eco3_list = data_assets.get("eco3_list", pd.DataFrame()) asset_list_starting_size = asset_list.shape[0] # Change the column name if it's ECO eligibility asset_list = asset_list.rename( columns={ "ECO eligibility": "ECO Eligibility", "ECO Eligibilty": "ECO Eligibility", }, ) # Remove surplus 
whitespace from the ECO Eligibility column asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip() # Push to lower case asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower() # Remap asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].replace(eco_eligibility_map) if not ciga_list.empty: # We merge on ciga and update the status to reflect if it has failed ciga or not # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA # check ciga_list_to_merge = ciga_list[["asset_list_row_id", "Guarantee"]].copy() ciga_list_to_merge = ciga_list_to_merge[~pd.isnull(ciga_list_to_merge["asset_list_row_id"])] asset_list = asset_list.merge(ciga_list_to_merge, how='left', on="asset_list_row_id") asset_list["ECO Eligibility"] = np.where( ( asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) & (asset_list["Guarantee"] == "Yes") ), "failed ciga", asset_list["ECO Eligibility"] ) # We replace any remaining "Subject to CIGA" with pass Ciga asset_list["ECO Eligibility"] = np.where( ( asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) & (asset_list["Guarantee"] == "No") ), "eco4 - passed ciga", asset_list["ECO Eligibility"] ) asset_list = asset_list.drop(columns=["Guarantee"]) # Update the asset list with the categorisations and rename changes if asset_list.shape[0] != asset_list_starting_size: raise ValueError("The asset list has changed in size") # If we have eco3 surveys, we set a property to not eligible if not eco3_list.empty: eco3_list_to_merge = eco3_list[["asset_list_row_id"]].copy() eco3_list_to_merge["has_eco3"] = True asset_list = asset_list.merge( eco3_list_to_merge, how="left", on="asset_list_row_id" ) if asset_list.shape[0] != asset_list_starting_size: raise ValueError("The asset list has changed in size, when merging on eco3") # Any rows that have an eco3 survey are set to not eligible asset_list["ECO Eligibility"] = np.where( 
asset_list["has_eco3"] == True, "not eligible", asset_list["ECO Eligibility"] ) # asset_list = asset_list.drop(columns=["has_eco3"]) # Report on sales sales_report = {} if not survey_list.empty: scheme_column = survey_list.columns[0] # Remap the values in the scheme column survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) # We clean up the survey list installation or cancelled if "INSTALLED OR CANCELLED" in survey_list.columns: survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() # Remove all punctuation survey_list["installed_or_cancelled_clean"] = survey_list[ "installed_or_cancelled_clean"].str.replace( r'[^\w\s]', '', regex=True ) # Remove double spaces survey_list["installed_or_cancelled_clean"] = survey_list[ "installed_or_cancelled_clean"].str.replace( r'\s+', ' ', regex=True ) # Remove trailing spaces survey_list["installed_or_cancelled_clean"] = survey_list[ "installed_or_cancelled_clean"].str.strip() survey_list["installation_status"] = None survey_list["installation_status"] = np.where( survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), "installed", survey_list["installation_status"] ) survey_list["installation_status"] = np.where( survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), "cancelled", survey_list["installation_status"] ) # Find partial installations survey_list["installation_status"] = np.where( survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), "in progress", survey_list["installation_status"] ) # Find partial cancellations # TODO: We might have more indications of partial cancellations survey_list["installation_status"] = np.where( survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), "cancelled", survey_list["installation_status"] ) else: # We have some examples, e.g. 
HA28, where we do not have the installed or cancelled column if 'INSTALL/ CANCELLATION DATE' in survey_list.columns: survey_list["installation_status"] = np.where( survey_list['INSTALL/ CANCELLATION DATE'].str.lower().str.contains("cancelled"), "cancelled", "installed", ) else: survey_list["installation_status"] = np.where( survey_list['INSTALL / CANCELLATION DATE'].str.lower().str.contains("cancelled"), "cancelled", "installed", ) # Finally, for other cases, we set the status to "in progress" survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") # We concatenate the scheme name with the installation status survey_list["installation_status"] = ( survey_list[scheme_column] + " - " + survey_list["installation_status"] ) # We get the sales sales_report = { "ECO4 - surveys sold": survey_list.shape[0], **survey_list["installation_status"].value_counts().to_dict() } # We find some cases where properties have sold but are missing CIGA checks survey_list_to_merge = survey_list[["asset_list_row_id", "installation_status"]].copy() survey_list_to_merge["has_a_survey_record"] = True survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])] asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id") # Update the cases where properties have sold, but are missing a CIGA check # If we don't have a CIGA list, we set the value to ECO4 set_to = "eco4 - passed ciga" if not ciga_list.empty else "eco4" asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"].str.contains("subject to ciga")) & ( asset_list["has_a_survey_record"] == True ), set_to, asset_list["ECO Eligibility"] ) # Update the cases where a property has been marked as eligible for GBIS, but sold for ECO4 asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "gbis") & ( asset_list["installation_status"].isin( ["ECO4 - installed", "ECO4 - cancelled", "ECO4 - in progress"] ) ), 
"eco4", asset_list["ECO Eligibility"] ) # Update the cases where a property was marked as eligible for ECO4, but sold for GBIS asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"].isin( [ "eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga", "eco4 (subject to archetype)", "eco4 (subject to ciga) (subject to archetype)" ] )) & ( asset_list["installation_status"].isin( ["GBIS - installed", "GBIS - cancelled", "GBIS - in progress"] ) ), "gbis", asset_list["ECO Eligibility"] ) # Update the cases where a property is marked as not eligible, but sold for GBIS asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "not eligible") & ( asset_list["installation_status"].isin( ["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"] )), "gbis", asset_list["ECO Eligibility"] ) # Update the cases where a property is marked as not eligible, but sold for ECO4 asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "not eligible") & ( asset_list["installation_status"].isin( ["ECO4 - in progress", "ECO4 - installed", "ECO4 - cancelled"] ) ), "eco4", asset_list["ECO Eligibility"] ) asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"]) # Update the survey list with installation status self.data[ha_name]["survey_list"] = survey_list # Insert updated asset list self.data[ha_name]["asset_list"] = asset_list ha_facts_and_figures.append( { "HA Name": ha_name, **asset_list["ECO Eligibility"].value_counts().to_dict(), **sales_report } ) ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures) ha_facts_and_figures = ha_facts_and_figures.drop( columns=["not eligible"] ) ha_facts_and_figures = ha_facts_and_figures.fillna(0) # Make all columns apart from HA NAme integers for col in ha_facts_and_figures.columns[1:]: ha_facts_and_figures[col] = ha_facts_and_figures[col].astype(int) ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name") 
# Keywords searched for (in this order) in free-text property descriptions, with the property type each
# one maps to
_KEYWORD_PROPERTY_TYPES = (
    ("house", "House"),
    ("flat", "Flat"),
    ("bungalow", "Bungalow"),
    ("maisonette", "Maisonette"),
)

# HAs whose property type is found by scanning a free-text column for the keywords above
_KEYWORD_SCAN_COLUMNS = {
    "HA9": "Asset Type",
    "HA21": "Property Type",
    "HA31": "A_AssetType",
}

# HAs whose column value is used as the property type without any remapping
_PASSTHROUGH_COLUMNS = {
    "HA17": "property_type",
    "HA19": "Dwelling Type",
    "HA27": "Property Type",
    "HA54": "Property Type",
}

# HAs whose (whitespace stripped) column value is remapped with PROPERTY_TYPE_LOOKUP[ha_name].get(...),
# giving None for values that are not in the lookup
_STRIPPED_LOOKUP_COLUMNS = {
    "HA2": "Dwelling Type",
    "HA5": "Asset Type",
    "HA8": "Property Type",
    "HA11": "Property Type",
    "HA12": "Asset_Type1",
    "HA13": "Type Cd",
    "HA15": "Property Type",
    "HA18": "Asset Type",
    "HA20": "Asset Type",
    "HA24": "Property Type",
    "HA32": "Dwelling type",
    "HA34": "Property Type",
    "HA35": "Property Type Grouping",
    "HA37": "PROPERTY TYPE",
    "HA41": "Archetype",
    "HA42": "Dwelling use/type",
    "HA45": "Property type",
    "HA48": "Property Type",
    "HA50": "Property Type",
    "HA51": "Asset Type",
    "HA52": "Property Type",
    "HA56": "Dwelling Type Description",
    "HA63": "PropertyType",
    "HA117": "Property Type",
    "HAXXX": "Unit Description",
}

# HAs whose raw column value is remapped with PROPERTY_TYPE_LOOKUP[ha_name][...], raising a KeyError
# for values that are not in the lookup
_STRICT_LOOKUP_COLUMNS = {
    "HA25": "T1_AssetType",
    "HA28": "Property Type - Academy",
    "HA30": "A_AssetType",
}


def get_property_type_and_built_form(property_meta, ha_name):
    """
    Work out the property type and built form for one asset list row of the given HA.

    Each HA records this information in different columns and with different vocabularies, so the
    extraction rule is chosen by HA name.

    :param property_meta: The asset list row (anything supporting ["column"] access; HA107 also
        requires .get)
    :param ha_name: The HA identifier, e.g. "HA25"
    :return: A (property_type, built_form) tuple; either element may be None
    :raises NotImplementedError: If no rule exists for the HA
    """
    if ha_name == "HA44":
        return None, None

    if ha_name in _KEYWORD_SCAN_COLUMNS:
        property_description = property_meta[_KEYWORD_SCAN_COLUMNS[ha_name]].strip().lower()
        for keyword, property_type in _KEYWORD_PROPERTY_TYPES:
            if keyword in property_description:
                return property_type, None
        return None, None

    if ha_name in _PASSTHROUGH_COLUMNS:
        return property_meta[_PASSTHROUGH_COLUMNS[ha_name]], None

    # HA52 can have a missing property type, which cannot be stripped
    if ha_name == "HA52" and property_meta["Property Type"] is None:
        return None, None

    if ha_name in _STRIPPED_LOOKUP_COLUMNS:
        column = _STRIPPED_LOOKUP_COLUMNS[ha_name]
        return PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta[column].strip()), None

    if ha_name in _STRICT_LOOKUP_COLUMNS:
        column = _STRICT_LOOKUP_COLUMNS[ha_name]
        return PROPERTY_TYPE_LOOKUP[ha_name][property_meta[column]], None

    # The remaining HAs each need bespoke handling
    if ha_name == "HA1":
        asset_type = property_meta["Asset Type"]
        if asset_type == "a":
            # We correct a small error
            asset_type = "House"
        elif asset_type in ["Bedsit", "Room"]:
            # Remap bedsits to flats
            asset_type = "Flat"
        form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"], None)
        return asset_type, form

    if ha_name == "HA6":
        dwelling = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]]
        return dwelling, property_meta["built_form"]

    if ha_name == "HA7":
        archetype = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["Archetype"])
        form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"])
        return archetype, form

    if ha_name == "HA14":
        if property_meta["Asset Type Description"] != "Block - Repair":
            return PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][
                property_meta["Asset Type Description"]
            ], None
        # We try and deduce if it's a flat or house, depending on if it has "room" in the address
        return ("House" if "room" in property_meta["Address 1"].lower() else "Flat"), None

    if ha_name == "HA16":
        config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]]
        return config.get("property-type"), config.get("built-form")

    if ha_name == "HA39":
        style_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {})
        style_type = style_config.get("property_type", None)
        style_form = style_config.get("built_form", None)
        if style_type is None:
            # Fall back to checking for the presence of flat in the address
            style_type = "Flat" if "flat" in property_meta["matching_address"] else "House"
        return style_type, style_form

    if ha_name == "HA49":
        return property_meta["Property Class"].strip(), None

    if ha_name == "HA107":
        return property_meta.get("property_type", None), property_meta.get("built_form", None)

    if ha_name == "HAXX":
        return property_meta["Property Type"].split(":")[0].strip(), None

    raise NotImplementedError("Implement me")
processed_ha_results["results_df"], "scoring_df": processed_ha_results["scoring_df"], "nodata": processed_ha_results["nodata"] } continue # For each HA, we read pull in the data required, and store in S3 asset_list = data_assets["asset_list"].copy() # If the survey list is missing, it means we have no yet completed any surveys and therefore should only # consider the most recent EPC consider_penultimate_epc = data_assets["survey_list"] is not None # We iterate through the asset list and pull what we need results = [] scoring_data = [] nodata = [] failed_model_rows = [] for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): if property_meta["matching_postcode"] is None: continue property_type, built_form = get_property_type_and_built_form( property_meta=property_meta, ha_name=ha_name ) searcher = SearchEpc( address1=str(property_meta["HouseNo"]), postcode=property_meta["matching_postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key="", full_address=property_meta["matching_address"] ) searcher.ordnance_survey_client.property_type = property_type searcher.ordnance_survey_client.built_form = built_form searcher.find_property(skip_os=True) if searcher.newest_epc is None: nodata.append(property_meta) continue if searcher.newest_epc.get("estimated"): # We insert the row ID as our proxy for UPRN searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) newest_epc = searcher.newest_epc older_epcs = searcher.older_epcs full_sap_epc = searcher.full_sap_epc # If we have a survey list, we check the penultimate, because the property might have been installed penultimate_epc = newest_epc if consider_penultimate_epc: # We also want to get the penultimate epc penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) if not penultimate_epc: penultimate_epc = newest_epc eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) eligibility.check_gbis_warmfront() eligibility.check_eco4_warmfront() # We check the conditions for 
checking the penultimate epc identified_for_gbis = property_meta["ECO Eligibility"] in ["gbis"] identified_for_eco4 = property_meta["ECO Eligibility"] in ["eco4"] subject_to_ciga = property_meta["ECO Eligibility"] in [ "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga" ] # condition 1 - identified for gbis and not eligible condition_1 = (identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront["eligible"] ) & consider_penultimate_epc # condition 2 - identified for eco4 and not eligible condition_2 = (identified_for_eco4 and not eligibility.eco4_warmfront[ "eligible"]) & consider_penultimate_epc # successfully identigied gbis condition_3 = ( identified_for_gbis and (eligibility.gbis_warmfront or eligibility.eco4_warmfront["eligible"]) ) # Nothing identified condition_4 = ( not identified_for_gbis and not identified_for_eco4 and not eligibility.gbis_warmfront and not subject_to_ciga and not eligibility.eco4_warmfront["eligible"] ) # Not identified but seemingly eligible for eco4 or gbis condition_5 = ( not identified_for_gbis and not identified_for_eco4 and ( eligibility.eco4_warmfront["eligible"] or eligibility.gbis_warmfront ) ) condition_6 = ( subject_to_ciga and not eligibility.eco4_warmfront["eligible"] ) if condition_1 or condition_2: # We check the penultimate epc eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) eligibility.check_gbis_warmfront() eligibility.check_eco4_warmfront() # If this is the case, we need to update the older epcs # We don't update just to make data cleaning easier if penultimate_epc.get("estimated") is None: older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] elif condition_3 or condition_4 or condition_5 or condition_6: pass else: NotImplementedError("Implement me") # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity # Loft MUST be suitable cavity_age = None if ( identified_for_eco4 and 
not eligibility.eco4_warmfront["eligible"] ): # We check the age of the cavity and if it's particularly old, we flag it cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) if eligibility.eco4_warmfront["eligible"]: if eligibility.epc["uprn"] == "": eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) try: scoring_dictionary = prepare_model_data_row( property_id=property_meta["asset_list_row_id"], modelling_epc=eligibility.epc, cleaned=cleaned, cleaning_data=cleaning_data, created_at=created_at, old_data=older_epcs, full_sap_epc=full_sap_epc, photo_supply_lookup=photo_supply_lookup, floor_area_decile_thresholds=floor_area_decile_thresholds ) scoring_data.extend(scoring_dictionary) except Exception as e: # If we fail, we just keep a record of it failed_model_rows.append( property_meta["asset_list_row_id"] ) results.append( { "row_id": property_meta["asset_list_row_id"], "uprn": eligibility.epc["uprn"], "is_estimated": searcher.newest_epc.get("estimated") is not None, "property_type": eligibility.epc["property-type"], "eco4_eligible": eligibility.eco4_warmfront["eligible"], "eco4_message": eligibility.eco4_warmfront["message"], "eco4_strict": eligibility.eco4_warmfront["strict"], "gbis_eligible": eligibility.gbis_warmfront["eligible"], "gbis_message": eligibility.gbis_warmfront["message"], "gbis_strict": eligibility.gbis_warmfront["strict"], "sap": float(eligibility.epc["current-energy-efficiency"]), # Property components "roof": eligibility.roof["clean_description"], "walls": eligibility.walls["clean_description"], "cavity_type": eligibility.cavity["type"], "heating": eligibility.epc["mainheat-description"], "tenure": eligibility.tenure, "date_epc": eligibility.epc["lodgement-date"], "loft_thickness": eligibility.roof["insulation_thickness"], "cavity_age": cavity_age, "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] } ) results_df = 
pd.DataFrame(results) scoring_df = pd.DataFrame(scoring_data) results_df["post_install_sap"] = None results_df["eligibility_classification"] = None if not scoring_df.empty: scoring_df = scoring_df.drop( columns=[ "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", "carbon_ending" ] ) model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) model_api.MODEL_PREFIXES = ["sap_change_predictions"] scoring_df["id"] = scoring_df["id"] + "phase=0" # We split up the scoring_df and score predictions = [] to_loop_over = range(0, scoring_df.shape[0], 400) for chunk in tqdm(to_loop_over, total=len(to_loop_over)): predictions_dict = model_api.predict_all( df=scoring_df.iloc[chunk:chunk + 400], bucket="retrofit-data-dev", prediction_buckets={ "sap_change_predictions": "retrofit-sap-predictions-dev", } ) predictions.append(predictions_dict["sap_change_predictions"]) predictions = pd.concat(predictions) predictions_size = predictions.shape[0] predictions = predictions.rename(columns={"property_id": "row_id"}).merge( results_df[["row_id", "sap"]], how="left", on="row_id" ) if predictions.shape[0] != predictions_size: raise ValueError("Predictions size has changed") predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] results_df = results_df.merge( predictions[["sap_uplift", "row_id"]], how="left", on="row_id" ) results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] eligibility_assessment = [] for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): # The upgrade requirements are dependent on the current SAP # If the property is an F or G, it only needs to upgrade to an % if row["sap"] <= 38: if row["post_install_sap"] >= 57: eligibility_classification = "highest confidence" elif row["post_install_sap"] >= 55: eligibility_classification = "high confidence" elif row["post_install_sap"] >= 53: eligibility_classification = "medium confidence" else: 
eligibility_classification = "unlikely" else: if row["post_install_sap"] >= 71: eligibility_classification = "highest confidence" elif row["post_install_sap"] >= 69: eligibility_classification = "high confidence" elif row["post_install_sap"] >= 67: eligibility_classification = "medium confidence" else: eligibility_classification = "unlikely" eligibility_assessment.append( { "row_id": row["row_id"], "eligibility_classification": eligibility_classification } ) eligibility_assessment = pd.DataFrame(eligibility_assessment) # Make sure the results haven't changed in size results_df = results_df.merge( eligibility_assessment, how="left", on="row_id" ) if results_df.shape[0] != len(results): raise ValueError("results has changed size") # We store the results in S3 as a pickle save_pickle_to_s3( data={ "results_df": results_df, "scoring_df": scoring_df, "nodata": nodata }, bucket_name="retrofit-datalake-dev", s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle" ) outputs[ha_name] = { "results_df": results_df, "scoring_df": scoring_df, "nodata": nodata } return outputs def get_col_widths(dataframe): # Define a maximum width for any column to prevent excessively wide columns max_allowed_width = 25 # Calculate widths for columns widths = [] if isinstance(dataframe.columns, pd.MultiIndex): # For MultiIndex, calculate max width considering the header and data header_widths = [max(len(str(item)) for item in col) + 2 for col in dataframe.columns.values] # +2 for padding for i, column in enumerate(dataframe.columns): max_data_width = max(dataframe[column].astype(str).apply(len).max(), header_widths[i]) widths.append(min(max_data_width, max_allowed_width)) else: # For non-MultiIndex, calculate width normally for col in dataframe.columns: # Calculate the max length of data or column name and limit it max_length = max(dataframe[col].astype(str).apply(len).max(), len(str(col)) + 2) # +2 for padding widths.append(min(max_length, max_allowed_width)) return widths # def 
analyse_ha_data(outputs, loader): # """ # The approach we take within this function is the following: # For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The # characterisation can be broken down as the following: # 1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria # 2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to # a CIGA check # 3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft # insulation # 4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under # any cirsumstances, given the available data # # Then, for any property that has NOT been identifid by Warmfront, we identify properties that look like they would # qualify under the strictest criteria, and mark these as potential additional opportunities. # # :return: # """ # # eco4_rate = 1710 # gbis_rate = 600 # # old_eco4_rate = 1456 # old_gbis_rate = 432 # # epc_c_threshold = 80 # scheme_map = { # "ECO4": "ECO4", # "AFFORDABLE WARMTH": "ECO4", # "ECO4 A/W": "ECO4", # "ECO4 GBIS (ECO+)": "GBIS" # } # # ha_analysis_results = [] # total_revenue_results = [] # for ha_name, datasets in outputs.items(): # inputs = [x for k, x in loader.data.items() if k == ha_name][0] # # results_df = datasets["results_df"].copy() # # analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename( # columns={"row_meaning": "asset_identification_status"} # ).merge( # results_df, # how="left", # right_on="row_id", # left_on="asset_list_row_id" # ) # # analysis_data["is_remaining"] = True # # n_sold_eco4 = 0 # n_sold_gbis = 0 # if not inputs["survey_list"].empty: # # Merge on the survey list and signal everything that is remaining or not (i.e. 
anything that hasn't had # # a survey) # survey_list = inputs["survey_list"].copy() # # # TODO: TEMP # scheme_column = survey_list.columns[0] # # We clean up the survey list installation or cancelled # survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() # # Remove all punctuation # survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( # r'[^\w\s]', '', regex=True # ) # # Remove double spaces # survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( # r'\s+', ' ', regex=True # ) # # Remove trailing spaces # survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() # # # Remap the values in the scheme column # survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) # # survey_list["installation_status"] = None # survey_list["installation_status"] = np.where( # survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), # "installed", # survey_list["installation_status"] # ) # survey_list["installation_status"] = np.where( # survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), # "cancelled", # survey_list["installation_status"] # ) # # Find partial installations # survey_list["installation_status"] = np.where( # survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), # "partially installed", # survey_list["installation_status"] # ) # # Find partial cancellations # # TODO: We might have more indications of partial cancellations # survey_list["installation_status"] = np.where( # survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), # "partially cancelled", # survey_list["installation_status"] # ) # # # Finally, for other cases, we set the status to "in progress" # survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") # # # We concatenate the scheme 
name with the installation status # survey_list["installation_status"] = ( # survey_list[scheme_column] + " - " + survey_list["installation_status"] # ) # # # TODO: END TEMP # # survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy() # survey_list_to_merge["is_remaining"] = False # analysis_data = analysis_data.drop(columns="is_remaining").merge( # survey_list_to_merge, # how="left", on="asset_list_row_id" # ) # analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True) # # n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0] # n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0] # # # Take just remaining # analysis_data = analysis_data[analysis_data["is_remaining"]] # # # Also, if the HA has started selling, we remove any that are still subject to ciga # n_eco4_missed_subject_to_ciga = 0 # if not inputs["survey_list"].empty: # n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum() # analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"] # # ################################################################################################ # # We take the properties that strictly qualified under eco # ################################################################################################ # # eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy() # eco4_identified["identification_type"] = None # eco4_identified["identification_type"] = np.where( # (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True), # "strict", # eco4_identified["identification_type"] # ) # # # For expansive, the property can be no higher than an EPC C # eco4_identified["identification_type"] = np.where( # (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & ( # eco4_identified["sap"] <= 
epc_c_threshold # ), # "expansive", # eco4_identified["identification_type"] # ) # ################################################################################################ # # We take the properties dependent on CIGA # ################################################################################################ # # ciga_dependent_identified = analysis_data[ # analysis_data["ECO Eligibility"].isin( # [ # "eco4 (subject to ciga)", # "eco4 - passed ciga" # ] # ) # ].copy() # # # These are properties that show filled cavity # ciga_dependent_identified["identification_type"] = None # ciga_dependent_identified["identification_type"] = np.where( # ciga_dependent_identified["eco4_message"].isin( # [ # "Perfect suitability", # "Meets cavity and sap", # "Fails cavity, meets loft, fails SAP", # "Meets fabric, fails SAP check", # "Meets cavity, loft borderline, meets sap", # ] # ) & (ciga_dependent_identified["sap"] <= epc_c_threshold), # "strict", # ciga_dependent_identified["identification_type"] # ) # # ciga_dependent_identified["identification_type"] = np.where( # ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | ( # ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"]) # )) & ( # (ciga_dependent_identified["sap"] <= epc_c_threshold) & # pd.isnull(ciga_dependent_identified["identification_type"]) # ), # "expansive", # ciga_dependent_identified["identification_type"] # ) # # ################################################################################################ # # We properties that qualified for gbis # ################################################################################################ # gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy() # gbis_identified["identification_type"] = None # gbis_identified["identification_type"] = np.where( # (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69), # "strict", # gbis_identified["identification_type"] # 
) # # gbis_identified["identification_type"] = np.where( # (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & ( # pd.isnull(gbis_identified["identification_type"]) # ), # "expansive", # gbis_identified["identification_type"] # ) # # # Finally, we look at the properties that have not been identified by Warmfront # not_identified = analysis_data[ # analysis_data["ECO Eligibility"].isin( # [ # "not eligible" # ] # ) # ].copy() # # surplus_eco4 = not_identified[ # (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin( # ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"] # )) # ] # # surplus_gbis = not_identified[ # (not_identified["gbis_eligible"] == True) & ( # ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values) # ) & (not_identified["sap"] < 69) & ( # (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | ( # not_identified["walls"].str.contains("partial", case=False, na=False) # ) # ) # ] # surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False] # # # Output variables - the data was sent to us in December, but the remaining figures are # # what was in November # november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name] # # # ECO4 # n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0] # november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0) # november_eco4_sold = november_remaining["No. 
of Tech surveys complete - Eco 4"].values[0] # eco4_sales_since_november = n_sold_eco4 - november_eco4_sold # # n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0] # eco4_of_which_identified_strict = ( # eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] + # ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0] # ) # eco4_of_which_identified_expansive = ( # eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] + # ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0] # ) # # GBIS # n_warmfront_identified_gbis = gbis_identified.shape[0] # november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0) # november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0] # gbis_sales_since_november = n_sold_gbis - november_gbis_sold # gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0] # gbis_of_which_identified_expansive = \ # gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0] # # to_append = { # ("", "HA Name"): ha_name, # ("", "# properties in asset list"): n_properties_remaining_in_asset_list, # ############ # # ECO4 # ############ # ("ECO4", "# remaining November file"): november_eco4_remaining, # ("ECO4", "# sold in November file"): november_eco4_sold, # ("ECO4", "# sold (survey list)"): n_sold_eco4, # ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga, # ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4, # ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict, # ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive, # ("ECO4", "Of which identified by model - total"): ( # eco4_of_which_identified_strict + eco4_of_which_identified_expansive # ), # 
("ECO4", "Additional properties"): surplus_eco4.shape[0], # ############ # # GBIS # ############ # ("GBIS", "# remaining November file"): november_gbis_remaining, # ("GBIS", "# sold in November file"): november_gbis_sold, # ("GBIS", "# sold (survey list)"): n_sold_gbis, # ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis, # ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict, # ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive, # ("GBIS", "Of which identified by model - total"): ( # gbis_of_which_identified_strict + gbis_of_which_identified_expansive # ), # ("GBIS", "Additional properties"): surplus_gbis.shape[0] # } # # ha_analysis_results.append(to_append) # # # Calculate the revenue results # to_append_revenue = { # ("", "HA Name"): ha_name, # # Eco4 revenue # ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate, # ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate, # ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate, # ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate, # ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate, # ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate, # ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate, # ("ECO4", "Of which identified by model - total"): eco4_rate * ( # eco4_of_which_identified_strict + eco4_of_which_identified_expansive # ), # ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0], # } # total_revenue_results.append(to_append_revenue) # # ha_analysis_results = pd.DataFrame(ha_analysis_results) # ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns) # # facts_and_figures = loader.facts_and_figures.copy() # facts_and_figures["ha_number"] = facts_and_figures["HA 
Name"].str.extract(r'(\d+)').astype(int) # facts_and_figures = facts_and_figures.sort_values("ha_number") # facts_and_figures = facts_and_figures.drop(columns=["ha_number"]) # # # Rename some of the cols # facts_and_figures = facts_and_figures.rename( # columns={ # # ECO4 cols # "ECO4": "ECO4 - November", # "GBIS": "GBIS - November", # "eco4 (subject to ciga)": "ECO4 - subject to ciga", # "eco4": "ECO4 - doesn't need CIGA", # "eco4 - passed ciga": "ECO4 - passed CIGA", # "failed ciga": "ECO4 - failed CIGA", # "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS", # "ECO4 - in progress": "ECO4 - Install in progress", # "ECO4 - cancelled": "ECO4 - Install cancelled", # # GBIS cols # "gbis": "GBIS total (asset list)" # } # ) # # We calculate the eco4 total from the asset list # # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is # # ECO4 - doesn't need CIGA + ECO4 - passed CIGA # # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is # # ECO4 - doesn't need CIGA + ECO4 - subject to ciga # facts_and_figures["ECO4 total (asset list - pre ciga)"] = ( # facts_and_figures["ECO4 - doesn't need CIGA"] + # facts_and_figures["ECO4 - subject to ciga"] + # facts_and_figures["ECO4 - passed CIGA"] # ) # # facts_and_figures["ECO4 total (asset list - post ciga)"] = None # facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where( # facts_and_figures["ECO4 - passed CIGA"] > 0, # facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"], # facts_and_figures["ECO4 total (asset list - post ciga)"] # ) # # # Re-arrange the columns # facts_and_figures = facts_and_figures[ # [ # 'HA Name', # 'ECO4 - November', # 'GBIS - November', # 'ECO4 total (asset list - pre ciga)', # 'ECO4 total (asset list - post ciga)', # 'GBIS total (asset list)', # 'ECO4 - subject to ciga', # "ECO4 - doesn't need CIGA", # 'ECO4 - passed CIGA', # 'ECO4 - failed CIGA', # 'ECO4 - installed', # 'ECO4 
- Install in progress', # 'ECO4 - Install cancelled', # 'ECO4 - partially installed', # 'ECO4 - Install downgrade to GBIS', # ] # ] # # Addd a note to flag any rows where ECO4 ( # # subject to ciga is greater than 0) and (ECO4 - passed ciga is greater than 0 # # ) # facts_and_figures["Missed CIGA checks opportunity"] = None # facts_and_figures["Missed CIGA checks opportunity"] = np.where( # (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0), # "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype( # str) + " ECO4 properties needing a CIGA check", # facts_and_figures["Missed CIGA checks opportunity"] # ) # # facts_and_figures.to_csv("Facts and figures sample.csv") # # # Re arrage the columns # # # Also sort ha_analysis_results by ha number # ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int) # ha_analysis_results = ha_analysis_results.sort_values("ha_number") # ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"]) # # # We save 2 sheets # # Automate creation of the excel # # Create a Pandas Excel writer using XlsxWriter as the engine # with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer: # # Write each dataframe to a different worksheet without the index # for df, sheet in [(facts_and_figures, 'HA Facts and Figures'), # (ha_analysis_results, 'Asset Identification')]: # # df.to_excel(writer, sheet_name=sheet) # # # Auto-adjust columns' width # for i, width in enumerate(get_col_widths(df)): # writer.sheets[sheet].set_column(i, i, width) # # # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their # # description, and what proportion of time they get identified via non-invasive surveys # # # true_eco4_assets = [] # # ciga_dependent_assets = [] # # not_eligible = [] # # as_built_insulated = [] # # date_cols = { # # "HA39": "date_built", # # "HA14": 
"Built In Year", # # "HA6": "Construction Year", # # "HA1": "Build Date", # # "HA107": "YEAR BUILT" # # } # # for ha_name, data_objects in outputs.items(): # # inputs = [x for k, x in loader.data.items() if k == ha_name][0] # # # # date_col = date_cols[ha_name] # # results_df = data_objects["results_df"].copy() # # df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename( # # columns={"row_meaning": "asset_identification_status", date_col: "date_built"} # # ).merge( # # results_df, # # how="left", # # right_on="row_id", # # left_on="asset_list_row_id" # # ) # # # # # take the true ECO4 # # true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy() # # ciga_dependent = df[ # # df["ECO Eligibility"].isin( # # [ # # "eco4 (subject to ciga)", # # "failed ciga", # # "eco4 - passed ciga" # # ] # # ) # # ] # # insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy() # # # We convert date built to datetime # # try: # # insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])] # # insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year # # as_built_insulated.append(insulated_assumed) # # except Exception as e: # # print("oh well") # # # # true_eco4_assets.append(true_eco4) # # ciga_dependent_assets.append(ciga_dependent) # # # # true_eco4_assets = pd.concat(true_eco4_assets) # # ciga_dependent_assets = pd.concat(ciga_dependent_assets) # # as_built_insulated = pd.concat(as_built_insulated) # # # # true_eco4_assets["walls"].value_counts(normalize=True) # # ciga_dependent_assets["walls"].value_counts(normalize=True) # # # # from recommendations.recommendation_utils import extract_insulation_thickness # # # # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply( # # lambda x: extract_insulation_thickness(x) # # ) # # # # true_eco4_assets["e"] = true_eco4_assets.merge( # # 
pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]], # # how="left", # # left_on="roof", # # right_on="original_description" # # ) # # # # true_eco4_assets["sap"].mean() # # # # true_eco4_assets["insulation_thickness"].isin( # # ["250", "150", "200", "100", "75", "50"] # # ).sum() / true_eco4_assets.shape[0] # # # # true_eco4_assets["insulation_thickness"].isin( # # ["100"] # # ).sum() / true_eco4_assets.shape[0] # # # # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True) def get_propensity_model_data( loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True ): # TODO: Set a seed! model_data = [] for ha_name, data_assets in loader.data.items(): logger.info("Processing HA: %s", ha_name) if data_assets["survey_list"].empty: continue number_sold = data_assets["survey_list"].shape[0] # For each HA, we read pull in the data required, and store in S3 asset_list = data_assets["asset_list"].copy() # We determine the number of properties that we should select that are eligible asset_list_size = asset_list.shape[0] # Number eligible n_eligibile = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0] success_rate = n_eligibile / asset_list_size needed_sample_size = np.ceil(number_sold / success_rate) number_negative_samples = int(needed_sample_size - number_sold) sold_asset_list_ids = data_assets["survey_list"]["asset_list_row_id"].tolist() negative_sample_asset_list_ids = asset_list["asset_list_row_id"].sample(number_negative_samples).tolist() sample_ids = sold_asset_list_ids + negative_sample_asset_list_ids sample_asset_list = asset_list[asset_list["asset_list_row_id"].isin(sample_ids)] # In order to have the most confidence, we should take just properties that have 1 EPC. 
We might need to # cut down the number of properties that we include because of this # Note: This is an imbalanced problem so we will need to build a model accomadating of that data = [] errors = [] for index, property_meta in tqdm(sample_asset_list.iterrows(), total=len(sample_asset_list)): if property_meta["matching_postcode"] is None: continue property_type, built_form = get_property_type_and_built_form( property_meta=property_meta, ha_name=ha_name ) searcher = SearchEpc( address1=str(property_meta["HouseNo"]), postcode=property_meta["matching_postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key="", full_address=property_meta["matching_address"] ) searcher.ordnance_survey_client.property_type = property_type searcher.ordnance_survey_client.built_form = built_form searcher.find_property(skip_os=True) if searcher.newest_epc is None: continue if searcher.newest_epc.get("estimated"): # We insert the row ID as our proxy for UPRN searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) newest_epc = searcher.newest_epc older_epcs = searcher.older_epcs full_sap_epc = searcher.full_sap_epc # If we have more than 1 EPC for the moment we just continue if older_epcs or full_sap_epc: continue try: # We clean up the data epc_records = { 'original_epc': newest_epc.copy(), 'full_sap_epc': full_sap_epc.copy(), 'old_data': older_epcs.copy(), } epc_record = EPCRecord( epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data ) # If we have some data, continue data.append( { "ECO Eligibility": property_meta["ECO Eligibility"], "asset_list_row_id": property_meta["asset_list_row_id"], **epc_record.get("prepared_epc") } ) except Exception as e: errors.append( { "error": str(e), "asset_list_row_id": property_meta["asset_list_row_id"], "matching_postcode": property_meta["matching_postcode"], "matching_address": property_meta["matching_address"] } ) data = pd.DataFrame(data) # We store the results in S3 as a pickle save_pickle_to_s3( data=data, 
def conversion_model(loader):
    """
    Proof-of-concept conversion (propensity) model that prints XGBoost feature importances.

    Steps:
      1. For every HA name in ``loader.data`` read ``propensity_model_data/<ha_name>/train.pickle``
         from S3 and left-join the HA's ``survey_list`` installation status on ``asset_list_row_id``.
         An HA whose data cannot be read or merged is logged and skipped.
      2. Build a binary ``response`` (1 when the installation status is an ECO4 in-progress/installed
         status, 0 otherwise).
      3. Re-balance per HA: keep every positive row and sample negatives so that the share of
         positives approximates the share of the HA's asset list that is not "not eligible".
      4. Drop identifier/address columns, impute missing values, collapse rare categories, one-hot
         encode, fit an ``XGBClassifier`` and print the features sorted by importance.

    :param loader: object whose ``data`` attribute maps HA name -> dict containing at least the
        ``survey_list`` and ``asset_list`` dataframes.
    :return: None. The ranked feature importances are written to stdout.
    :raises ValueError: if no HA training data could be read at all.
    :raises ZeroDivisionError: if an HA's asset list is empty or contains no eligible rows.
    """
    sold_statuses = ["ECO4 - in progress", "ECO4 - installed"]

    # Read in the model data
    model_data = []
    for ha_name in loader.data.keys():
        try:
            picked = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
            )
            data = pd.DataFrame(picked)
            # We merge on the sales data
            sales_data = loader.data[ha_name]["survey_list"].copy()
            data = data.merge(
                sales_data[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            data["ha_name"] = ha_name
        except Exception:
            # Best effort: an HA without usable data is skipped, but keep the traceback so the
            # underlying cause is not lost.
            logger.error("Error reading in the data for %s", ha_name, exc_info=True)
            continue
        model_data.append(data)

    if not model_data:
        # pd.concat([]) would raise a less helpful ValueError
        raise ValueError("No propensity model data could be read for any HA")

    model_data = pd.concat(model_data)
    model_data["response"] = model_data["installation_status"].isin(sold_statuses).astype(int)

    # Because of how we pulled the data, we need to re-balance the sample
    balanced_sample = []
    for ha_name in model_data["ha_name"].unique():
        ha_rows = model_data[model_data["ha_name"] == ha_name]
        positive_samples = ha_rows[ha_rows["response"] == 1]
        negative_samples = ha_rows[ha_rows["response"] != 1]

        asset_list = loader.data[ha_name]["asset_list"]
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
        success_rate = n_eligible / asset_list_size

        needed_sample_size = np.ceil(positive_samples.shape[0] / success_rate)
        number_negative_samples = int(needed_sample_size - positive_samples.shape[0])
        # DataFrame.sample raises if asked for more rows than exist (without replacement), so we
        # cap the request at the number of negatives actually available.
        number_negative_samples = min(number_negative_samples, negative_samples.shape[0])
        # Seeded so that the balanced sample is reproducible, like the train/test split below
        negative_samples_subset = negative_samples.sample(number_negative_samples, random_state=42)

        balanced_sample.append(pd.concat([positive_samples, negative_samples_subset]))

    balanced_sample = pd.concat(balanced_sample)

    # We work with a small sample
    # Drop the identifier, address and outcome-source columns so only model features
    # (plus the response) remain
    balanced_sample = balanced_sample.drop(
        columns=['ECO Eligibility', 'asset_list_row_id', 'address', 'uprn_source', 'address3',
                 'local_authority_label', 'county', 'postcode', 'constituency', 'local_authority',
                 'inspection_date', 'address1', 'constituency_label', 'building_reference_number',
                 'address2', 'posttown', 'lodgement_datetime', 'uprn', 'lodgement_date', 'lmk_key',
                 'installation_status', 'ha_name']
    )

    # POC model
    df = balanced_sample.copy()

    # Fill missings with means, if they exist
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    df[categorical_cols] = df[categorical_cols].fillna("other")

    # Reduce the number of categories to a specific number and the rest to other
    max_n_categories = 10
    for col in categorical_cols:
        top_categories = df[col].value_counts().nlargest(max_n_categories).index
        df[col] = df[col].where(df[col].isin(top_categories), other="other")

    # Use a model based approach to feature selection
    import xgboost as xgb
    from sklearn.model_selection import train_test_split

    X = df.drop(columns=['response'])
    y = df['response']

    # Encoding categorical variables if not already done
    X = pd.get_dummies(X, drop_first=True)

    # Splitting the data into train and test sets (the test split is not used by this POC)
    X_train, _X_test, y_train, _y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit an XGBoost classifier with default settings
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Map feature importances to their corresponding column names and sort, most important first
    feature_importance_dict = dict(zip(X.columns, model.feature_importances_))
    sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

    # Display sorted features
    for feature, importance in sorted_features:
        print(f"{feature}: {importance}")
def _extend_missing(target, records):
    """Append each record to ``target`` unless an identical dict is already in the list."""
    for record in records:
        if record not in target:
            target.append(record)


def _average_transmittance_roof(original_description, clean_description, value):
    """Build the roof-description record for an 'Average thermal transmittance' entry."""
    return {
        'original_description': original_description,
        'clean_description': clean_description,
        'thermal_transmittance': value,
        'thermal_transmittance_unit': 'w/m-¦k',
        'is_pitched': False, 'is_roof_room': False, 'is_loft': False, 'is_flat': False,
        'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
        'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None,
    }


def patch_cleaned(cleaned):
    """
    Patch the cleaned EPC description lookups with records that are missing from them.

    The function mutates ``cleaned`` in place and also returns it. It:
      * adds two floor descriptions, two pitched-roof descriptions and the 'None' main heating
        control description;
      * adds an 'Average thermal transmittance <v>' roof description for every value
        0.00, 0.01, ..., 1.99 that has no such record yet;
      * adds the same values again in the 'W/m?K' spelling of the unit (two-decimal padded);
      * marks the '(Same dwelling below) insulated (assumed)' floor record as having another
        property below, with a thermal transmittance of 0.

    A record is only appended when an identical dict is not already present, so calling the
    function repeatedly on the same object does not accumulate duplicates.

    :param cleaned: dict with list-of-dict values under the keys "floor-description",
        "roof-description" and "mainheatcont-description".
    :return: the same ``cleaned`` object, patched.
    """
    floors = cleaned["floor-description"]
    roofs = cleaned["roof-description"]

    # Patch to handle missing floor descriptions
    _extend_missing(
        floors,
        [
            {'original_description': 'To external air, uninsulated (assumed)',
             'clean_description': 'To external air, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None,
             'is_assumed': True, 'is_to_unheated_space': False, 'is_to_external_air': True,
             'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'none'},
            {'original_description': 'To unheated space, uninsulated (assumed)',
             'clean_description': 'To unheated space, uninsulated',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None,
             'is_assumed': True, 'is_to_unheated_space': True, 'is_to_external_air': False,
             'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'average'},
        ]
    )

    # Missing pitched-roof descriptions (the "Unknown loft insulation" record is added once only)
    _extend_missing(
        roofs,
        [
            {'original_description': 'Pitched, Unknown loft insulation',
             'clean_description': 'Pitched, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None,
             'is_pitched': True, 'is_roof_room': False, 'is_loft': False, 'is_flat': False,
             'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'},
            {'original_description': 'Pitched, 300+mm loft insulation',
             'clean_description': 'Pitched, 300+ mm loft insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None,
             'is_pitched': True, 'is_roof_room': False, 'is_loft': True, 'is_flat': False,
             'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '300+'},
        ]
    )

    thermal_transmittance_values = list(np.arange(0, 2, 0.01))

    # Make sure every thermal transmittance value has an "Average thermal transmittance" record
    for ttv in thermal_transmittance_values:
        ttv_rounded = round(ttv, 2)
        already_present = any(
            (x["thermal_transmittance"] == ttv_rounded)
            and "Average thermal transmittance" in x["clean_description"]
            for x in roofs
        )
        if already_present:
            continue
        roofs.append(
            _average_transmittance_roof(
                original_description=f'Average thermal transmittance {ttv_rounded} W/m-¦K',
                clean_description=f'Average thermal transmittance {ttv_rounded} w/m-¦k',
                value=ttv_rounded,
            )
        )

    # We also patch a funny unit value we found
    for ttv in thermal_transmittance_values:
        ttv_rounded = round(ttv, 2)
        has_funny_unit_record = any(
            (x["thermal_transmittance"] == ttv_rounded)
            and "Average thermal transmittance" in x["clean_description"]
            and x["thermal_transmittance_unit"] == "w/m?K"
            for x in roofs
        )
        if has_funny_unit_record:
            continue

        # Values such as 0.1 appear as "0.10" in this spelling, so pad to two decimals
        ttv_string = str(ttv_rounded)
        if len(ttv_string) == 3:
            ttv_string = f"{ttv_string}0"

        # The patched record is stored with the normalised unit, so the check above never
        # recognises it; _extend_missing stops a second call from adding it again.
        _extend_missing(
            roofs,
            [
                _average_transmittance_roof(
                    original_description=f'Average thermal transmittance {ttv_string} W/m?K',
                    clean_description=f'Average thermal transmittance {ttv_string} w/m-¦k',
                    value=ttv_rounded,
                )
            ]
        )

    # Patch mainheatcont-description
    _extend_missing(
        cleaned["mainheatcont-description"],
        [
            {'original_description': 'None', 'clean_description': 'None',
             'thermostatic_control': None, 'charging_system': None, 'switch_system': None,
             'no_control': None, 'dhw_control': None, 'community_heating': None,
             'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
             'rate_control': None}
        ]
    )

    # We patch this record because there is another property below
    for x in floors:
        if x["original_description"] == '(Same dwelling below) insulated (assumed)':
            x["another_property_below"] = True
            x["thermal_transmittance"] = 0

    return cleaned
def calculate_eco4_post_ciga(
        eligiblity_counts, input_data, ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate,
        eco4_rate, archetype_conversion_rate
):
    """
    Estimate how many ECO4 jobs survive the archetype check, the CIGA check and cancellations.

    Every count in the returned dict is a plain ``int`` regardless of whether the HA has a CIGA
    list, and the four outcome buckets (post CIGA, failed archetype, failed CIGA, expected
    cancellations) are produced from the same rounded intermediate values.

    :param eligiblity_counts: dataframe with an "ECO Eligibility" label column and a "count" column.
    :param input_data: mapping whose "ciga_list" dataframe tells us (via ``.empty``) whether the HA
        has run a CIGA check.
    :param ha_ciga_conversion_rate: share of assets awaiting a CIGA check expected to pass it.
    :param ha_ciga_pass_to_sale_rate: share of CIGA-passed assets expected to convert to a sale.
    :param ha_eco4_to_sale_rate: share of ECO4 assets needing no CIGA check expected to convert to a sale.
    :param eco4_rate: revenue per ECO4 job, used for the "£" entries.
    :param archetype_conversion_rate: share of "subject to archetype" assets expected to pass that check.
    :return: dict of counts ("#") and revenues ("£") keyed by report label.
    """
    labels = eligiblity_counts["ECO Eligibility"]
    counts = eligiblity_counts["count"]

    def count_where(mask):
        # Total of the "count" column over the rows selected by the boolean mask
        return counts[mask].sum()

    needs_ciga = labels.str.contains("subject to ciga")
    needs_archetype = labels.str.contains("subject to archetype")

    # Assets needing both checks are scaled down by the archetype conversion rate and then join the
    # pool of assets that still need a CIGA check
    ciga_and_archetype = count_where(needs_ciga & needs_archetype)
    ciga_and_archetype_passed = np.round(ciga_and_archetype * archetype_conversion_rate)
    remaining_needing_ciga_check = count_where(needs_ciga & ~needs_archetype) + ciga_and_archetype_passed

    # Same treatment for assets that only need the archetype check
    archetype_only = count_where(labels == "eco4 (subject to archetype)")
    archetype_only_passed = np.round(archetype_only * archetype_conversion_rate)
    eco4_no_ciga_needed = count_where(labels == "eco4") + archetype_only_passed

    failed_archetype_check = int(
        ciga_and_archetype + archetype_only - ciga_and_archetype_passed - archetype_only_passed
    )

    # Confirmed part: assets that need no CIGA check, plus (when a CIGA list exists) assets that
    # already passed it, each multiplied by its own sale conversion rate
    has_ciga_check = not input_data["ciga_list"].empty
    if has_ciga_check:
        eco4_ciga_passed = count_where(labels == "eco4 - passed ciga")
        eco4_confirmed_ciga_failures = int(count_where(labels == "failed ciga"))
        eco4_confirmed = np.round(
            (eco4_no_ciga_needed * ha_eco4_to_sale_rate) + (eco4_ciga_passed * ha_ciga_pass_to_sale_rate)
        )
        confirmed_pool = eco4_no_ciga_needed + eco4_ciga_passed
    else:
        eco4_confirmed_ciga_failures = 0
        eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate)
        confirmed_pool = eco4_no_ciga_needed
    confirmed_cancellations = int(confirmed_pool - eco4_confirmed)

    # Forecast part: assets still waiting on a CIGA check. Nothing is forecast when nothing is
    # waiting, which also avoids multiplying zero by an undefined (NaN) rate.
    if remaining_needing_ciga_check > 0:
        expected_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate)
        eco4_remaining_forecast = np.round(expected_to_pass * ha_ciga_pass_to_sale_rate)
        forecast_cancellations = int(expected_to_pass - eco4_remaining_forecast)
        eco4_estimated_ciga_failures = int(remaining_needing_ciga_check - expected_to_pass)
    else:
        eco4_remaining_forecast = 0
        forecast_cancellations = 0
        eco4_estimated_ciga_failures = 0

    eco4_post_ciga = int(eco4_confirmed + eco4_remaining_forecast)
    eco4_remaining_forecast = int(eco4_remaining_forecast)
    eco4_confirmed = int(eco4_confirmed)
    eco4_expected_cancellations = confirmed_cancellations + forecast_cancellations
    total_ciga_failures = eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures

    results = {
        # Counts
        "ECO4 - post CIGA - #": eco4_post_ciga,
        "Of which confirmed - #": eco4_confirmed,
        "Of which forecast - #": eco4_remaining_forecast,
        # Revenue
        "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate,
        "Of which confirmed - £": eco4_confirmed * eco4_rate,
        "Of which forecast - £": eco4_remaining_forecast * eco4_rate,
        # Archetype check failures
        "Estimated total - failed archetype check - #": failed_archetype_check,
        "Estimated total - failed archetype check - £": failed_archetype_check * eco4_rate,
        # Ciga failures
        "Estimated total - failed CIGA": int(total_ciga_failures),
        "Confirmed CIGA failures": eco4_confirmed_ciga_failures,
        "Estimated CIGA failures": eco4_estimated_ciga_failures,
        # Ciga failures cost
        "Estimated total - failed CIGA - £": int(total_ciga_failures * eco4_rate),
        "Confirmed CIGA failures - £": int(eco4_confirmed_ciga_failures * eco4_rate),
        "Estimated CIGA failures - £": int(eco4_estimated_ciga_failures * eco4_rate),
        # Expected cancellations
        "Expected cancellations - #": eco4_expected_cancellations,
        "Expected cancellations - £": eco4_expected_cancellations * eco4_rate
    }

    return results
eco4_expected_cancellations, "Expected cancellations - £": eco4_expected_cancellations * eco4_rate } return results def forecast_remaining_sales(loader): # Assumptions: # We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate # and I don't want the numbers to change too much, depenent on the CIGA conversation rate maximum_ciga_conversion = 0.75 # This is a hard limit to the allowed conversion rates to final sale. These are typically very # high but there are some anomalies, amongst surveys that are early on sales_conversion_lower_bound = 0.8 gbis_rate = 600 eco4_rate = 1710 # Based on ONS https://www.ons.gov.uk/peoplepopulationandcommunity/housing/bulletins/housingenglandandwales # /census2021 # there are 5.7 million terraced properties in the UK, of the 19.3 million houses or bungalows. We therefore apply # a 30% discount to homes that are dependent on an archetype check, since around 30% of them will be mid terraced # This 30% is slightly harsh but we be conservative # Therefore, the archetype check conversion rate is 70% archetype_conversion_rate = 0.7 # 1) Calculate the conversion rate from passed CIGA to actual sale converted_ciga_jobs = [] for ha_name, input_data in loader.data.items(): asset_list = input_data["asset_list"].copy() survey_list = input_data["survey_list"].copy() if survey_list.empty: continue ciga_dependent_assets = asset_list[ asset_list["ECO Eligibility"] == "eco4 - passed ciga" ] # These are now the ciga dependent assets at installation ciga_dependent_assets_at_installation = ciga_dependent_assets.merge( survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id" ) # We then calculate how many get cancelled ciga_dependent_assets_sold = ciga_dependent_assets_at_installation[ ciga_dependent_assets_at_installation["installation_status"].isin( [ "ECO4 - installed", "ECO4 - in progress" ] ) ] ciga_dependent_assets_failed = ciga_dependent_assets_at_installation[ 
~ciga_dependent_assets_at_installation["installation_status"].isin( [ "ECO4 - installed", "ECO4 - in progress" ] ) ] converted_ciga_jobs.append( { "HA Name": ha_name, "# Ciga dependent at installation": ciga_dependent_assets_at_installation.shape[0], "# Ciga dependent successfully installed": ciga_dependent_assets_sold.shape[0], "# Ciga dependent failed install": ciga_dependent_assets_failed.shape[0] } ) converted_ciga_jobs = pd.DataFrame(converted_ciga_jobs) # We calculate a ciga pass to install conversaion rate median_ciga_pass_to_install = ( converted_ciga_jobs["# Ciga dependent successfully installed"].sum() / converted_ciga_jobs["# Ciga dependent at installation"].sum() ) # 2) Calculate the conversion rate from CIGA dependent to ciga passed ciga_passrates = [] for ha_name, input_data in loader.data.items(): # If we don't have a ciga list, we can't do anything if input_data["ciga_list"].empty: continue # 1) Calculate the conversion rate for CIGA to actual sale asset_list = input_data["asset_list"].copy() ciga_completed_assets = asset_list[ asset_list["ECO Eligibility"].isin( [ "eco4 - passed ciga", "failed ciga" ] ) ] ciga_passed = ciga_completed_assets[ ciga_completed_assets["ECO Eligibility"].isin( [ "eco4 - passed ciga" ] ) ] ciga_passrates.append( { "Ha Name": ha_name, "# CIGA dependent": ciga_completed_assets.shape[0], "# CIGA passed": ciga_passed.shape[0], } ) ciga_passrates = pd.DataFrame(ciga_passrates) median_ciga_success_rate = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum() # 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install eco4_ciga_independent_to_install = [] gbis_to_install = [] for ha_name, input_data in loader.data.items(): asset_list = input_data["asset_list"].copy() survey_list = input_data["survey_list"].copy() if survey_list.empty: continue # For properties that were identified as a typical ECO4 job, we calculate the number of properties that # installed # vs 
cancelled typical_eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"] typical_gbis = asset_list[asset_list["ECO Eligibility"] == "gbis"] # Merge on the surveys typical_eco4_installed = typical_eco4.merge( survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id" ) if not typical_eco4_installed.empty: typical_eco4_sold = typical_eco4_installed[ typical_eco4_installed["installation_status"].isin( [ "ECO4 - installed", "ECO4 - in progress" ] ) ] eco4_ciga_independent_to_install.append( { "Ha Name": ha_name, "# ECO4 at install stage": typical_eco4_installed.shape[0], "# ECO4 successfully installed": typical_eco4_sold.shape[0] } ) typical_gbis_installed = typical_gbis.merge( survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id" ) if not typical_gbis_installed.empty: typical_gbis_sold = typical_gbis_installed[ typical_gbis_installed["installation_status"].isin( [ "GBIS - in progress", "GBIS - installed" ] ) ] gbis_to_install.append( { "Ha Name": ha_name, "# GBIS at install stage": typical_gbis_installed.shape[0], "# GBIS successfully installed": typical_gbis_sold.shape[0] } ) eco4_ciga_independent_to_install = pd.DataFrame(eco4_ciga_independent_to_install) gbis_to_install = pd.DataFrame(gbis_to_install) eco4_ciga_independent_to_install["conversion"] = ( eco4_ciga_independent_to_install["# ECO4 successfully installed"] / eco4_ciga_independent_to_install["# ECO4 at install stage"] ) eco4_ciga_independent_to_install_clipped = eco4_ciga_independent_to_install[ eco4_ciga_independent_to_install["conversion"] >= sales_conversion_lower_bound ] gbis_to_install["conversion"] = ( gbis_to_install["# GBIS successfully installed"] / gbis_to_install["# GBIS at install stage"] ) gbis_to_install_clipped = gbis_to_install[ gbis_to_install["conversion"] >= sales_conversion_lower_bound ] median_eco4_to_install = ( eco4_ciga_independent_to_install_clipped["# ECO4 successfully installed"].sum() / 
eco4_ciga_independent_to_install_clipped["# ECO4 at install stage"].sum() ) median_gbis_to_install = ( gbis_to_install_clipped["# GBIS successfully installed"].sum() / gbis_to_install_clipped["# GBIS at install stage"].sum() ) # Produce the final output december_figures = loader.december_figures.copy() december_figures = december_figures.fillna(0) # If we have negative remaining, it means that actually sold more gbis than they initially thought so we set # remaining to 0 december_figures["ECO4 remaining"] = np.where( december_figures["ECO4 remaining"] < 0, 0, december_figures["ECO4 remaining"] ) december_figures["GBIS remaining"] = np.where( december_figures["GBIS remaining"] < 0, 0, december_figures["GBIS remaining"] ) results = [] for ha_name, input_data in loader.data.items(): # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] if original_warmfront_estimates.empty: # Append an empty row original_warmfront_estimates = december_figures.head(1).copy() for k in original_warmfront_estimates.columns: original_warmfront_estimates[k] = 0 original_warmfront_estimates["HA Name"] = ha_name original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0] original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0] original_warmfront_sold_eco4 = ( original_warmfront_estimates["No. of Tech surveys complete - Eco 4"].values[0] * eco4_rate ) original_warmfront_eco4_revenue = original_warmfront_eco4 * eco4_rate original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate original_warmfront_sold_gbis = ( original_warmfront_estimates["No. 
of Tech surveys complete - GBIS"].values[0] * gbis_rate ) # Original warmfront figures - GBIS original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0] original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0] original_warmfront_gbis_revenue = ( original_warmfront_gbis * gbis_rate ) original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate # Asset list - ECO4 asset_list = input_data["asset_list"].copy() survey_list = input_data["survey_list"].copy() if survey_list.empty: asset_list_remaining = asset_list.copy() else: # For HA6, there are a small number of postcodes that do not match to any item in the asset list survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])] asset_list_remaining = asset_list.merge( survey_list[["asset_list_row_id", "installation_status"]], how="left", on="asset_list_row_id" ) # Anything that has an installation has gone to installation, and therefore is not remaining asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"]) eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index() eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index() eco4_pre_ciga = eligiblity_counts[ eligiblity_counts["ECO Eligibility"].isin( [ "eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga", "eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to archetype)" ] ) ]["count"].sum() eco4_pre_ciga_remaining = eligiblity_counts_remaining[ eligiblity_counts_remaining["ECO Eligibility"].isin( [ "eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga", "eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to archetype)" ] ) ]["count"].sum() eco4_pre_ciga_revenue = eco4_pre_ciga * eco4_rate 
eco4_pre_ciga_remaining_revenue = eco4_pre_ciga_remaining * eco4_rate # Total Eligible - this is what passed ciga checks + strict. If we don't have what passed CIGA, we estimate # We check if the HA has done a CIGA check. Also, if we have assets dormant at CIGA, we estimate what will # convert # We estimate a conversion for anything left post CIGA ha_ciga_conversion = ciga_passrates[ciga_passrates["Ha Name"] == ha_name] if not ha_ciga_conversion.empty: ha_ciga_conversion_rate = ( ha_ciga_conversion["# CIGA passed"].values[0] / ha_ciga_conversion["# CIGA dependent"].values[0] ) else: ha_ciga_conversion_rate = ( median_ciga_success_rate if median_ciga_success_rate <= maximum_ciga_conversion else maximum_ciga_conversion ) # We also need the ha ciga passed to install success rate ha_ciga_pass_to_sale = converted_ciga_jobs[converted_ciga_jobs["HA Name"] == ha_name] if not ha_ciga_pass_to_sale.empty and ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] != 0: ha_ciga_pass_to_sale_rate = ( ha_ciga_pass_to_sale["# Ciga dependent successfully installed"].values[0] / ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] ) else: ha_ciga_pass_to_sale_rate = median_ciga_pass_to_install ha_eco4_to_sale = eco4_ciga_independent_to_install_clipped[ eco4_ciga_independent_to_install_clipped["Ha Name"] == ha_name ] if not ha_eco4_to_sale.empty: ha_eco4_to_sale_rate = ( ha_eco4_to_sale['# ECO4 successfully installed'].values[0] / ha_eco4_to_sale['# ECO4 at install stage'].values[0] ) else: ha_eco4_to_sale_rate = median_eco4_to_install eco4_post_ciga_total_results = calculate_eco4_post_ciga( eligiblity_counts=eligiblity_counts, input_data=input_data, ha_ciga_conversion_rate=ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate=ha_eco4_to_sale_rate, eco4_rate=eco4_rate, archetype_conversion_rate=archetype_conversion_rate ) eco4_post_ciga_remaining_results = calculate_eco4_post_ciga( 
eligiblity_counts=eligiblity_counts_remaining, input_data=input_data, ha_ciga_conversion_rate=ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate=ha_eco4_to_sale_rate, eco4_rate=eco4_rate, archetype_conversion_rate=archetype_conversion_rate ) # Calculate the delta compared to Warmfront's original remaining if original_warmfront_remaining_eco4 == 0: eco4_delta_vs_original_estimate_remaining = "N/A" else: eco4_delta_vs_original_estimate_remaining = ((eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] - original_warmfront_remaining_eco4) / original_warmfront_remaining_eco4) # GBIS Figures # Estimate the GBIS conversion rate ha_gbis_sale_conversion = gbis_to_install_clipped[ gbis_to_install_clipped["Ha Name"] == ha_name ] if not ha_gbis_sale_conversion.empty: ha_gbis_sale_conversion = ( ha_gbis_sale_conversion["# GBIS successfully installed"].values[0] / ha_gbis_sale_conversion["# GBIS at install stage"].values[0] ) else: ha_gbis_sale_conversion = median_gbis_to_install gbis_total_pre_cancellations = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "gbis" ]["count"].sum() gbis_total_pre_cancellations_revenue = gbis_total_pre_cancellations * gbis_rate # gbis_total = int(np.round(gbis_total_pre_cancellations * ha_gbis_sale_conversion)) # gbis_total_revenue = int(gbis_total * gbis_rate) gbis_remaining_pre_cancellations = eligiblity_counts_remaining[ eligiblity_counts_remaining["ECO Eligibility"] == "gbis" ]["count"].sum() gbis_remaining_pre_cancellations_revenue = ( gbis_remaining_pre_cancellations * gbis_rate ) # This is the gbis jobs we expect to sell gbis_remaining = int(np.round(gbis_remaining_pre_cancellations * ha_gbis_sale_conversion)) gbis_remaining_revenue = int(gbis_remaining * gbis_rate) # This is the number we expect to cancel gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining) gbis_remaining_expected_cancellations_revenue = 
gbis_remaining_expected_cancellations * gbis_rate # GBIS delta if original_warmfront_remaining_gbis == 0: gbis_delta_vs_original_estimate_remaining = "N/A" else: gbis_delta_vs_original_estimate_remaining = ( (gbis_remaining - original_warmfront_remaining_gbis) / original_warmfront_remaining_gbis ) # Current sales figures # For any sales surveys that are complete, that could still cancel, we apply a conversion rate eco4_actually_sold = 0 eco4_confirmed_cancellations = 0 eco4_expected_cancellations = 0 gbis_actually_sold = 0 gbis_confirmed_cancellations = 0 gbis_expected_cancellations = 0 if not survey_list.empty: surveys_with_eligibility = survey_list.merge( asset_list[["asset_list_row_id", "ECO Eligibility"]], how="left", on="asset_list_row_id" ) completed_eco4_sales = surveys_with_eligibility[ surveys_with_eligibility["installation_status"] == "ECO4 - installed" ].shape[0] incomplete_eco4_sales = surveys_with_eligibility[ (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") & (~surveys_with_eligibility["ECO Eligibility"].isin( ["eco4 - passed ciga"]) ) ].shape[0] incomplete_eco4_sales_ciga = surveys_with_eligibility[ (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") & (surveys_with_eligibility["ECO Eligibility"].isin( ["eco4 - passed ciga"]) ) ].shape[0] eco4_confirmed_cancellations = surveys_with_eligibility[ surveys_with_eligibility["installation_status"] == "ECO4 - cancelled" ].shape[0] expected_eco4_sales_no_ciga = np.round(incomplete_eco4_sales * ha_eco4_to_sale_rate) expected_eco4_sales_ciga = np.round(incomplete_eco4_sales_ciga * ha_ciga_pass_to_sale_rate) eco4_expected_cancellations = (incomplete_eco4_sales + incomplete_eco4_sales_ciga) - ( expected_eco4_sales_no_ciga + expected_eco4_sales_ciga ) eco4_expected_cancellations = int(np.round(eco4_expected_cancellations)) eco4_actually_sold = eco4_rate * ( completed_eco4_sales + expected_eco4_sales_no_ciga + expected_eco4_sales_ciga ) completed_gbis_sales = 
surveys_with_eligibility[ surveys_with_eligibility["installation_status"] == "GBIS - installed" ].shape[0] incomplete_gbis_sales = surveys_with_eligibility[ (surveys_with_eligibility["installation_status"] == "GBIS - in progress") ].shape[0] # Get confirmed cancellations gbis_confirmed_cancellations = surveys_with_eligibility[ surveys_with_eligibility["installation_status"] == "GBIS - cancelled" ].shape[0] expected_gbis_unconfirmed_sales = np.round(incomplete_gbis_sales * ha_gbis_sale_conversion) gbis_expected_cancellations = int(incomplete_gbis_sales - expected_gbis_unconfirmed_sales) gbis_actually_sold = completed_gbis_sales * gbis_rate + ( expected_gbis_unconfirmed_sales * gbis_rate ) # Add in the variance: # We should expect that the pre-ciga total is: # 1) The number of post CIGA successes + # 2) The number of archetype failures + # 2) the number of CIGA failures + # 3) The number of cancellations variance_total = eco4_pre_ciga - ( eco4_post_ciga_total_results["ECO4 - post CIGA - #"] + eco4_post_ciga_total_results["Estimated total - failed archetype check - #"] + eco4_post_ciga_total_results['Estimated total - failed CIGA'] + eco4_post_ciga_total_results["Expected cancellations - #"] ) if variance_total != 0: raise ValueError("Something went wrong in variance total") variance_remaining = eco4_pre_ciga_remaining - ( eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] + eco4_post_ciga_remaining_results["Estimated total - failed archetype check - #"] + eco4_post_ciga_remaining_results['Estimated total - failed CIGA'] + eco4_post_ciga_remaining_results["Expected cancellations - #"] ) if variance_remaining != 0: raise ValueError("Something went wrong in variance remaining") # We also check variances to make sure that the pre-CIGA ECO4 total equals # 1) Pre CIGA remaining + # 2) ECO4 sold + # 3) ECO4 confirmed cancellations + # 4) ECO4 unconfirmed cancellations pre_ciga_eco4_variance = ( eco4_pre_ciga_revenue - eco4_pre_ciga_remaining_revenue - 
eco4_actually_sold - eco4_confirmed_cancellations * eco4_rate - eco4_expected_cancellations * eco4_rate ) if pre_ciga_eco4_variance != 0: raise ValueError("Something went wrong in pre_ciga_eco4_variance") # Check GBIS total variance # The total before cancellations should equal: # The number of sold + # The number of confirmed cancelled + # The number of expected cancelled + # The number of remaining gbis_variance = gbis_total_pre_cancellations - ( gbis_actually_sold / gbis_rate + gbis_confirmed_cancellations + gbis_expected_cancellations + gbis_remaining_pre_cancellations ) if gbis_variance != 0: raise ValueError("Something went wrong in gbis_variance") # We expect the remaining to equal expected sales + expected cancellations gbis_variance_2 = gbis_remaining_pre_cancellations - ( gbis_remaining + gbis_remaining_expected_cancellations ) if gbis_variance_2 != 0: raise ValueError("Something went wrong in gbis_variance2") # Update the GBIS sold, since Warmfront often sold more GBIS that expected original_warmfront_gbis_revenue = original_warmfront_sold_gbis + original_warmfront_remaining_gbis_revenue original_warmfront_gbis = ( original_warmfront_sold_gbis / gbis_rate + original_warmfront_remaining_gbis_revenue / gbis_rate ) to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, ("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4, ("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue, ("ECO4 original", "", "Sold or cancelled - £", ""): original_warmfront_sold_eco4, ("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, # GBIS - original warmfront figures ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis, ("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis, ("GBIS original", "", "Total - £", ""): 
original_warmfront_gbis_revenue, ("GBIS original", "", "Sold or cancelled - £", ""): original_warmfront_sold_gbis, ("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue, # ECO4 - asset list, pre-ciga ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL", ""): pre_ciga_eco4_variance, ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL VS ELIGIBLE & INELIGIBLE", ""): variance_total, ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 REMAINING VS ELIGIBLE & INELIGIBLE", ""): variance_remaining, ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold, ("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations * eco4_rate, # This is for jobs that are in-progress and could still cancel ("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations * eco4_rate, # ECO4 - asset list, post ciga, total ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"): eco4_post_ciga_total_results[ "ECO4 - post CIGA - #"], ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[ "ECO4 - post CIGA - £"], # ECO4 - asset list, post ciga, remaining ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - #"], ("ECO4 post-ciga", "", "Estimated remaining eligible - £", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - £"], ("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", ""): eco4_delta_vs_original_estimate_remaining, ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""): eco4_post_ciga_remaining_results["Of which confirmed - #"], ("ECO4 post-ciga", "", 
"Of which - confirmed (post CIGA or no CIGA required) - £", ""): eco4_post_ciga_remaining_results["Of which confirmed - £"], ("ECO4 post-ciga", "", "Of which forecast - #", ""): eco4_post_ciga_remaining_results["Of which forecast - #"], ("ECO4 post-ciga", "", "Of which forecast - £", ""): eco4_post_ciga_remaining_results["Of which forecast - £"], # Expected ECO4 cancellations ("ECO4 Cancellations", "", "Of which expected cancellations - #", ""): eco4_post_ciga_remaining_results[ "Expected cancellations - #" ], ("ECO4 Cancellations", "", "Of which expected cancellations - £", ""): eco4_post_ciga_remaining_results[ "Expected cancellations - £" ], # Archetype check failures ("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - #", ""): eco4_post_ciga_remaining_results['Estimated total - failed archetype check - #'], ("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - £", ""): eco4_post_ciga_remaining_results['Estimated total - failed archetype check - £'], # CIGA failures ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[ 'Estimated total - failed CIGA' ], ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - £", ""): eco4_post_ciga_remaining_results[ 'Estimated total - failed CIGA - £' ], ("ECO4 CIGA failures", "", "Confirmed failures - #", ""): eco4_post_ciga_remaining_results[ "Confirmed CIGA failures" ], ("ECO4 CIGA failures", "", "Confirmed failures - £", ""): eco4_post_ciga_remaining_results[ "Confirmed CIGA failures - £" ], ("ECO4 CIGA failures", "", "Estimated failures - #", ""): eco4_post_ciga_remaining_results[ "Estimated CIGA failures" ], ("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[ "Estimated CIGA failures - £" ], # GBIS postcode list ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total_pre_cancellations, ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS 
total"): gbis_total_pre_cancellations_revenue, ("GBIS Postcode list", "Warmfront post code list", "GBIS VARIANCE", "GBIS total"): gbis_variance, ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold, ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate, # This is for jobs that are in-progress and could still cancel ("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations * gbis_rate, ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining_pre_cancellations, ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_pre_cancellations_revenue, ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""): gbis_delta_vs_original_estimate_remaining, # Expected cancellations ( "GBIS Postcode list", "", "Of which expected sales - £ - £", "GBIS total"): gbis_remaining_revenue, ("GBIS Postcode list", "", "Of which expected cancellations -£", "GBIS total"): gbis_remaining_expected_cancellations_revenue } # Make sure nothing is forgotten due to duplicate multi-index keys if len(to_append) != 51: raise ValueError("Something went wrong") results.append(to_append) results = pd.DataFrame(results) results.to_csv("pipeline_remaining_raw.csv") totals_row = {} for col in results.columns: if col == ('', '', '', 'HA Name'): totals_row[col] = "Total" elif col in [ ("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", ""), ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "") ]: totals_row[col] = None else: totals_row[col] = results[col].sum() # For the delta columns, we calculate the delta on the totals totals_row[("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", "")] = ( ( totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] - totals_row[("ECO4 original", "", "Remaining - #", "")] ) / 
totals_row[("ECO4 original", "", "Remaining - #", "")] ) totals_row[("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")] = ( ( totals_row[("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")] - totals_row[("GBIS original", "", "Remaining - #", "")] ) / totals_row[("GBIS original", "", "Remaining - #", "")] ) blank_row = pd.DataFrame([{col: "" for col in results.columns}]) # Put together a Warmfront original remaining ECO4 vs asset list remaining ECO4 and same for GBIS, as well as totals # ECO4 Headlines headline_eco4_original_remaining = totals_row[("ECO4 original", "", "Remaining - #", "")] headline_eco4_original_remaining_revenue = totals_row[("ECO4 original", "", "Remaining - £", "")] headline_eco4_postcode_list_remaining = totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] headline_eco4_postcode_list_remaining_revenue = totals_row[ ("ECO4 post-ciga", "", "Estimated remaining eligible - £", "") ] headline_eco4_delta = 100 * ( (headline_eco4_postcode_list_remaining - headline_eco4_original_remaining) / headline_eco4_original_remaining ) headline_eco4_delta = round(headline_eco4_delta, 1) # GBIS Headlines headline_gbis_original_remaining = totals_row[("GBIS original", "", "Remaining - #", "")] headline_gbis_original_remaining_revenue = totals_row[("GBIS original", "", "Remaining - £", "")] headline_gbis_postcode_list_remaining = totals_row[ ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total") ] headline_gbis_postcode_list_remaining_revenue = totals_row[ ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total") ] headline_gbis_delta = 100 * ( (headline_gbis_postcode_list_remaining - headline_gbis_original_remaining) / headline_gbis_original_remaining ) headline_gbis_delta = round(headline_gbis_delta, 1) headline_original_total_revenue_remaining = ( headline_eco4_original_remaining_revenue + headline_gbis_original_remaining_revenue ) 
headline_postcode_list_total_revenue_remaining = ( headline_eco4_postcode_list_remaining_revenue + headline_gbis_postcode_list_remaining_revenue ) headline_total_delta = 100 * ( (headline_postcode_list_total_revenue_remaining - headline_original_total_revenue_remaining) / headline_original_total_revenue_remaining ) headline_total_delta = round(headline_total_delta, 1) headline_eco4_sold_since_november = ( totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] + totals_row[('ECO4 pre-ciga', '', 'Confirmed cancellations - £', '')] + # confirmed canclleations totals_row[('ECO4 pre-ciga', '', 'Unconfirmed cancellations - £', '')] - # expected cancellations totals_row[('ECO4 original', '', 'Sold or cancelled - £', '')] ) headline_gbis_sold_since_november = ( totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] + totals_row[("GBIS Postcode list", "", "Confirmed cancellations - £", "")] + # confirmed cancellations totals_row[("GBIS Postcode list", "", "Unconfirmed cancellations - £", "")] - # expected cancellations totals_row[('GBIS original', '', 'Sold or cancelled - £', '')] ) headlines = [ { ("", "", "", "HA Name"): "Headlines", }, { ("", "", "", "HA Name"): "ECO4 Remaining - November - #", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_original_remaining }, { ("", "", "", "HA Name"): "ECO4 Remaining - November - £", ( "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_original_remaining_revenue }, { ("", "", "", "HA Name"): "ECO4 Sold or cancelled since November - £", ( "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_sold_since_november }, { ("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - #", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining }, { ("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - £", ("", "Original Warmfront estimate", 
"Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining_revenue }, { ("", "", "", "HA Name"): "ECO4 £ remaining delta - %", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_eco4_delta) + "%" }, { ("", "", "", "HA Name"): "GBIS Remaining - November - #", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining }, { ("", "", "", "HA Name"): "GBIS Remaining - November - £", ( "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining_revenue }, { ("", "", "", "HA Name"): "GBIS Sold or cancelled since November - £", ( "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_sold_since_november }, { ("", "", "", "HA Name"): "GBIS Remaining - post code list - #", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining }, { ("", "", "", "HA Name"): "GBIS Remaining - post code list - £", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining_revenue }, { ("", "", "", "HA Name"): "GBIS delta %", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_gbis_delta) + "%" }, # Total revenue { ("", "", "", "HA Name"): "Total Remaining - November - £", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_original_total_revenue_remaining }, { ("", "", "", "HA Name"): "Total Remaining - post code list (post CIGA) - £", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_postcode_list_total_revenue_remaining }, { ("", "", "", "HA Name"): "Total Remaining delta %", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_total_delta) + "%" }, ] assumptions = [ { ("", "", "", "HA Name"): "Assumptions", }, { ("", "", "", "HA Name"): "ECO4 rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + 
str(eco4_rate) }, { ("", "", "", "HA Name"): "GBIS rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(gbis_rate) }, { ("", "", "", "HA Name"): "Median CIGA pass rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( round(median_ciga_success_rate * 100, 1)) + "%", }, { ("", "", "", "HA Name"): "Maximum allowed CIGA pass rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( round(maximum_ciga_conversion * 100, 1)) + "%", ("ECO4 original", "", "Remaining - #", ""): "- Maximum allowed CIGA conversion for HAs without CIGA checks We do not allow above this to be " "conservative" }, { ("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( round(median_eco4_to_install * 100, 1)) + "%", ("ECO4 original", "", "Remaining - #", ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Surveys that resulted " "in cancelled install are excluded." }, { ("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( round(median_ciga_pass_to_install * 100, 1)) + "%", ("ECO4 original", "", "Remaining - #", ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Surveys that resulted in " "cancelled installs are excluded." } ] results = pd.concat( [ results, pd.DataFrame([totals_row]), blank_row, pd.DataFrame(headlines), blank_row, blank_row, pd.DataFrame(assumptions) ] ) with open("HA Remaining Analysis.csv", "w", newline="") as file: # Write the DataFrame data without the index (adjust if you want the index). 
def fml_data_pull(loader):
    """
    Pull the newest EPC for every potentially eligible property of each HA and pickle the results to S3.

    For each HA in the hard-coded list, the asset-list rows whose "ECO Eligibility" is anything other than
    "not eligible" are looked up through SearchEpc (with the Ordnance Survey step skipped). The EPCs that are
    found are combined into one DataFrame per HA and saved to the "retrofit-datalake-dev" bucket under
    "ha-analysis/revised/<HA>/epc_data.pickle".

    A failure while processing one HA does not stop the run: the error is logged and the remaining HAs are
    still processed.

    :param loader: object whose ``data`` attribute maps an HA name to a dict holding that HA's "asset_list"
        DataFrame
    :return: list of the HA names whose pull failed (empty when every HA succeeded)
    """
    ha_names = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", "HA50", "HA24", "HA15", "HA32", "HA6", "HA1",
        "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9",
        "HA27", "HA30", "HA31", "HA54", "HA49", 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51',
        'HA52', "HA17", "HA5", "HA20",
    ]
    # Can't pull from EPC database because it's based in Scotland
    # "HAXXX", "HAXX"

    from backend.SearchEpc import SearchEpc

    # NOTE(review): this looks like a hard-coded API credential - confirm whether it should be read from the
    # environment instead (the module already reads EPC_AUTH_TOKEN) and rotated.
    epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA="

    failed_has = []
    for ha in ha_names:
        print(f"Pulling data for {ha}")
        try:
            asset_list = loader.data[ha]["asset_list"].copy()
            # Properties found as eligible
            eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"]

            # For each property, search for the latest EPC
            epc_data = []
            for _, row in tqdm(eligible.iterrows(), total=eligible.shape[0]):
                property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha)

                if ha == "HAXXX":
                    address_parts = [
                        row["Door Number"], row["Address Line 1"], row["Address Line 2"],
                        row["Address Line 3"], row["Postcode"]
                    ]
                    full_address = ", ".join(str(part) for part in address_parts if part is not None)
                else:
                    full_address = row["matching_address"]

                searcher = SearchEpc(
                    address1=str(row["HouseNo"]),
                    postcode=row["matching_postcode"],
                    auth_token=epc_api_key,
                    os_api_key="",
                    property_type=property_type,
                    full_address=full_address,
                    fast=True
                )
                # Force the skipping of estimating the EPC
                searcher.ordnance_survey_client.property_type = None
                searcher.ordnance_survey_client.built_form = None

                searcher.find_property(skip_os=True)
                if searcher.newest_epc is None:
                    continue

                epc_data.append(
                    {
                        "asset_list_row_id": row["asset_list_row_id"],
                        **searcher.newest_epc.copy()
                    }
                )

            # Save the data in S3 as a pickle
            save_pickle_to_s3(
                data=pd.DataFrame(epc_data),
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle"
            )
        except Exception:
            # Best effort: keep going with the other HAs, but leave a trace of what failed and why
            logger.exception("Failed to pull EPC data for %s", ha)
            failed_has.append(ha)

    if failed_has:
        logger.error("EPC data pull failed for %d HA(s): %s", len(failed_has), ", ".join(failed_has))
    return failed_has
def extract_lower_bound(age_band):
    """
    Return the first year of an age band written as "<prefix>: <from>-<to>".

    Missing values, and bands that do not follow that layout (no colon, or a non-numeric first year),
    fall back to 1930.

    :param age_band: age band string, or a missing value
    :return: the lower bound year as an int
    """
    fallback_year = 1930
    if pd.isna(age_band):
        return fallback_year

    try:
        year_range = age_band.split(':')[1]
        first_year = year_range.split('-')[0]
        return int(first_year.strip())
    except (ValueError, IndexError):
        return fallback_year


def classify_loft(x):
    """
    Classify how likely a loft is to need insulation from its current insulation thickness.

    :param x: mapping / row with "roof_insulation_thickness" (convertible to float) and "epc_age" (in days)
    :return: "high" for a thickness up to 100, "medium" for a thickness up to 200 (or up to 270 when the EPC
        is at least five years old), otherwise "unlikely"
    """
    thickness = float(x["roof_insulation_thickness"])

    # high confidence
    if thickness <= 100:
        return "high"

    # The EPC age is only consulted for thicknesses between 200 and 270
    if thickness <= 200 or (thickness <= 270 and x["epc_age"] >= 5 * 365):
        return "medium"

    return "unlikely"
def _looks_loft_eligible(frame):
    """Boolean mask of the rows whose loft classification is "high" or "medium"."""
    return frame["roof_classiciation"].isin(["high", "medium"])


def _efficiency_at_most_80(frame):
    """Boolean mask of the rows whose current energy efficiency score is 80 or lower."""
    return frame["current-energy-efficiency"].astype(float) <= 80


def fml_analysis(loader):
    """
    Estimate, per HA, how much of the remaining ECO4 and GBIS pipeline is supported by EPC data.

    For every HA in the hard-coded list the function:

    1. reads the EPC records previously pickled to S3 and joins them onto the asset-list rows whose
       "ECO Eligibility" is not "not eligible" (rows without an EPC are flagged as ``estimated``);
    2. drops the properties that already have an installation status in the survey list;
    3. derives a roof insulation thickness from the EPC roof description and classifies the loft with
       ``classify_loft``;
    4. resolves the "subject to archetype" eligibilities from the EPC property type / built form;
    5. counts the surveyed properties that still look eligible, and scales those counts up to the properties
       without an EPC using the conversion rates observed on the surveyed ones.

    One row per HA is written to "analysis - revised - audit update.csv" in the working directory.

    :param loader: object exposing ``december_figures`` (DataFrame with "HA Name", "ECO4 remaining" and
        "GBIS remaining" columns) and ``data`` (HA name -> dict with "asset_list", "survey_list" and
        "ciga_list" DataFrames)
    :return: None
    :raises Exception: if a merge changes the number of rows, or if any property is still marked
        "subject to archetype" after the archetype check
    """
    # CIGA pass rate used for HAs that have no CIGA list of their own
    assumed_ciga_pass_rate = 0.731

    ha_names = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", "HA50", "HA24", "HA15", "HA32", "HA6", "HA1",
        "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9",
        "HA27", "HA30", "HA31", "HA54", "HA49", 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51',
        'HA52', "HA17", "HA5", "HA20",
    ]

    # Wall descriptions accepted for properties that are not subject to a CIGA check
    no_ciga_cavity_descriptions = [
        "Cavity wall, as built, insulated (assumed)",
        "Cavity wall, as built, no insulation (assumed)",
        "Cavity wall, as built, partial insulation (assumed)",
        "Cavity wall, no insulation (assumed)",
        "Cavity wall, partial insulation (assumed)",
        "Cavity wall,",
        "Cavity wall, insulated (assumed)",
    ]

    # TODO: There will be some properties that are subject to CIGA that do not look like they need a CIGA check!
    #  Pass them! Non-invasives will have checked the wall though

    results = []
    for ha_name in tqdm(ha_names):
        december_row = loader.december_figures[loader.december_figures["HA Name"] == ha_name].copy()
        original_remaining = december_row["ECO4 remaining"].values[0]
        original_gbis_remaining = december_row["GBIS remaining"].values[0]

        ha_data = loader.data[ha_name]
        asset_list = ha_data["asset_list"].copy()
        # Properties found as eligible
        eligible_assets = asset_list[asset_list["ECO Eligibility"] != "not eligible"]

        # Read in the epc data
        epc_data = read_pickle_from_s3(
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle"
        )
        # Make sure we don't have duplicates. A basic drop is enough because it shouldn't be a huge issue here
        epc_data = epc_data.drop_duplicates("uprn")
        # Days from the inspection to now
        epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days

        if "estimated" not in epc_data.columns:
            # For all after HA7, we don't use estimated surveys
            epc_data["estimated"] = False

        remaining = eligible_assets.merge(epc_data, how="left", on="asset_list_row_id")
        # Asset rows that found no EPC have no "estimated" value: treat them as estimated
        remaining["estimated"] = remaining["estimated"].fillna(True)
        if remaining.shape[0] != eligible_assets.shape[0]:
            raise Exception(
                f"{ha_name}: joining the EPC data changed the row count "
                f"({eligible_assets.shape[0]} -> {remaining.shape[0]}); "
                "asset_list_row_id is not unique in the EPC data"
            )

        # Take just remaining
        survey_list = ha_data["survey_list"]
        if not survey_list.empty:
            survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
            remaining = remaining.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            remaining = remaining[pd.isnull(remaining["installation_status"])]
            remaining = remaining.drop(columns=["installation_status"])

        insulation_thicknesses = []
        for _, prop in remaining.iterrows():
            roof_description = prop["roof-description"]
            if pd.isnull(roof_description) or roof_description == "SAP05:Roof":
                continue
            thickness = RoofAttributes(roof_description).process()["insulation_thickness"]
            insulation_thicknesses.append(
                {
                    "uprn": prop["uprn"],
                    # If there is a + in the thickness, strip it out
                    "roof_insulation_thickness": str(thickness).replace("+", ""),
                }
            )
        insulation_thicknesses = pd.DataFrame(insulation_thicknesses)

        rows_before_merge = remaining.shape[0]
        remaining = remaining.merge(insulation_thicknesses, how="left", on="uprn")
        if remaining.shape[0] != rows_before_merge:
            raise Exception(
                f"{ha_name}: joining the roof insulation thickness changed the row count "
                f"({rows_before_merge} -> {remaining.shape[0]}); uprn is not unique"
            )

        # Automated archetype check
        if any(remaining["ECO Eligibility"].str.contains("subject to archetype")):
            # A house needs to be detached, semi-detached or end terrace. A bungalow must be detached
            house_passes = (
                (remaining["property-type"] == "House") &
                (remaining["built-form"].isin(["Semi-Detached", "End-Terrace", "Detached"]))
            )
            bungalow_passes = (
                (remaining["property-type"] == "Bungalow") &
                (remaining["built-form"].isin(["Detached"]))
            )
            passes_archetype = house_passes | bungalow_passes
            remaining["passes_archetype"] = passes_archetype

            # (current eligibility, archetype check result, new eligibility)
            eligibility_updates = [
                ("eco4 (subject to ciga) (subject to archetype)", True, "eco4 (subject to ciga)"),
                # If failed the archetype check and needs a CIGA, it's not eligible
                ("eco4 (subject to ciga) (subject to archetype)", False, "not eligible"),
                ("eco4 (subject to archetype)", True, "eco4"),
                ("eco4 (subject to archetype)", False, "gbis"),
            ]
            for current, check_result, updated in eligibility_updates:
                to_update = (remaining["ECO Eligibility"] == current) & (passes_archetype == check_result)
                remaining["ECO Eligibility"] = np.where(to_update, updated, remaining["ECO Eligibility"])

        if any(remaining["ECO Eligibility"].str.contains("subject to archetype")):
            raise Exception(
                f"{ha_name}: some properties are still marked 'subject to archetype' after the archetype check"
            )

        # Clean roof insulation. The order matters: "below average" has to be replaced before "average", and
        # "above average" becomes "above 150" before being reduced to "150"
        roof_thickness = remaining["roof_insulation_thickness"].fillna("0")
        for raw, cleaned in [
            ("below average", "50"),
            ("None", "0"),
            ("none", "0"),
            ("average", "150"),
            ("above 150", "150"),
        ]:
            roof_thickness = roof_thickness.str.replace(raw, cleaned)
        remaining["roof_insulation_thickness"] = roof_thickness

        remaining["roof_classiciation"] = remaining.apply(classify_loft, axis=1)

        had_survey = remaining[remaining["estimated"] == False]  # noqa: E712
        # Proportion with a survey
        proportion_with_survey = 100 * had_survey.shape[0] / remaining.shape[0]

        # Let's look just at the ECO4 business
        # For things that had a survey, take the properties that didn't need a CIGA check
        no_ciga_check_needed = had_survey[had_survey["ECO Eligibility"] == "eco4"]
        no_ciga_check_needed_eligible = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            _looks_loft_eligible(no_ciga_check_needed) &
            _efficiency_at_most_80(no_ciga_check_needed)
        ]
        # For anything not needing a CIGA check, some of it will be GBIS
        no_ciga_check_needed_eligible_gbis = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            _efficiency_at_most_80(no_ciga_check_needed) &
            (~no_ciga_check_needed["asset_list_row_id"].isin(
                no_ciga_check_needed_eligible["asset_list_row_id"].values
            ))
        ]

        # At risk work: properties that passed their CIGA check
        ciga_check_passed = had_survey[had_survey["ECO Eligibility"] == "eco4 - passed ciga"]
        ciga_check_passed_eligible = ciga_check_passed[
            (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) &  # noqa: E712
            _looks_loft_eligible(ciga_check_passed) &
            _efficiency_at_most_80(ciga_check_passed)
        ]

        ciga_list = ha_data["ciga_list"]
        if not ciga_list.empty:
            proportions = ciga_list["Guarantee"].value_counts(normalize=True)
            # The share of "No" values is used as the HA's pass rate. A list without any "No" gives 0
            ha_ciga_pass_rate = proportions.get("No", 0.0)
        else:
            ha_ciga_pass_rate = assumed_ciga_pass_rate

        # We take just the cavity walls
        # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/
        # This paper is based on London properties
        # The proportion of EPCs with building characteristics errors are shown to
        # differ between variables; floor and wall type errors occur in ~10-15% of EPCs,
        # compared with ~5% for wall insulation and glazing performance
        ciga_check_needed = had_survey[had_survey["ECO Eligibility"].str.contains("subject to ciga")].copy()
        ciga_check_needed_eligible = ciga_check_needed[
            (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) &  # noqa: E712
            _looks_loft_eligible(ciga_check_needed) &
            _efficiency_at_most_80(ciga_check_needed)
        ]

        # Finally, characterise gbis properties. Some of the business might look like ECO4 work, whereas we then
        # qualify what actually looks like gbis
        gbis_identified = had_survey[had_survey["ECO Eligibility"] == "gbis"].copy()
        gbis_looks_like_eco4 = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            _looks_loft_eligible(gbis_identified) &
            _efficiency_at_most_80(gbis_identified) &
            (
                (
                    (gbis_identified["property-type"] == "House") &
                    (gbis_identified["built-form"] != "Mid-Terrace")
                ) |
                (
                    (gbis_identified["property-type"] == "Bungalow") &
                    (gbis_identified["built-form"].isin(["Detached"]))
                )
            )
        ]
        gbis_qualified_rows = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            _efficiency_at_most_80(gbis_identified) &
            (~gbis_identified["asset_list_row_id"].isin(gbis_looks_like_eco4["asset_list_row_id"].values))
        ]

        ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate)
        without_ciga_expectation = no_ciga_check_needed_eligible.shape[0]
        passed_ciga_expectation = ciga_check_passed_eligible.shape[0]
        identified_as_gbis_looks_like_eco4 = gbis_looks_like_eco4.shape[0]

        total_eco4_expectation = (
            ciga_check_expectation + without_ciga_expectation + passed_ciga_expectation +
            identified_as_gbis_looks_like_eco4
        )
        # This is the work that is at risk
        eco4_work_at_risk = passed_ciga_expectation + ciga_check_expectation

        no_ciga_check_needed_actually_gbis = no_ciga_check_needed_eligible_gbis.shape[0]
        gbis_qualified = gbis_qualified_rows.shape[0]
        total_gbis_expectation = no_ciga_check_needed_actually_gbis + gbis_qualified

        if proportion_with_survey < 100:
            # We estimate the rest by applying the conversion rates of the surveyed properties.
            # NOTE(review): apart from the first case, the divisions below are not guarded against an empty
            # surveyed group and raise ZeroDivisionError in that situation - confirm the desired fallback.
            is_unsurveyed = remaining["estimated"] == True  # noqa: E712
            eligibility = remaining["ECO Eligibility"]

            without_survey_needing_ciga = remaining[
                is_unsurveyed & (eligibility.str.contains("subject to ciga") == True)  # noqa: E712
            ]
            if without_survey_needing_ciga.empty:
                without_survey_without_ciga_expected = 0
            elif ciga_check_needed.shape[0] == 0 and ciga_check_expectation == 0:
                # No surveyed sample to derive a rate from: count them all
                without_survey_without_ciga_expected = without_survey_needing_ciga.shape[0]
            else:
                without_survey_without_ciga_expected = np.round(
                    without_survey_needing_ciga.shape[0] *
                    (ciga_check_expectation / ciga_check_needed.shape[0])
                )

            without_survey_passed_ciga = remaining[is_unsurveyed & (eligibility == "eco4 - passed ciga")]
            if without_survey_passed_ciga.empty:
                without_survey_passed_ciga_expected = 0
            else:
                without_survey_passed_ciga_expected = np.round(
                    without_survey_passed_ciga.shape[0] *
                    (passed_ciga_expectation / ciga_check_passed.shape[0])
                )

            # No ciga needed
            without_survey_eco4 = remaining[is_unsurveyed & (eligibility == "eco4")]
            if without_survey_eco4.empty:
                without_survey_eco4_expected = 0
                without_survey_gbis_expected = 0
            else:
                without_survey_eco4_expected = np.round(
                    without_survey_eco4.shape[0] *
                    (without_ciga_expectation / no_ciga_check_needed.shape[0])
                )
                # NOTE(review): the numerator is the whole surveyed GBIS expectation (it includes the
                # properties identified as gbis), not only the eco4 properties that turned out to be GBIS -
                # confirm this is intended.
                without_survey_gbis_expected = np.round(
                    without_survey_eco4.shape[0] *
                    (total_gbis_expectation / no_ciga_check_needed.shape[0])
                )

            # And gbis
            without_survey_gbis = remaining[is_unsurveyed & (eligibility == "gbis")]
            if without_survey_gbis.empty:
                without_survey_identified_as_gbis_qualified = 0
                without_survey_identified_as_gbis_eco4 = 0
            else:
                without_survey_identified_as_gbis_qualified = np.round(
                    without_survey_gbis.shape[0] * (gbis_qualified / gbis_identified.shape[0])
                )
                # Scale by the number of un-surveyed GBIS properties (this used the un-surveyed ECO4 count)
                without_survey_identified_as_gbis_eco4 = np.round(
                    without_survey_gbis.shape[0] *
                    (identified_as_gbis_looks_like_eco4 / gbis_identified.shape[0])
                )

            total_eco4_expectation = (
                total_eco4_expectation + without_survey_without_ciga_expected +
                without_survey_passed_ciga_expected + without_survey_eco4_expected +
                without_survey_identified_as_gbis_eco4
            )
            total_gbis_expectation = (
                total_gbis_expectation + without_survey_gbis_expected +
                without_survey_identified_as_gbis_qualified
            )

        results.append(
            {
                "HA Name": ha_name,
                "Original ECO4 Estimate - Remaining": original_remaining,
                "Original GGBIS Estimate - Remaining": original_gbis_remaining,
                "EPC verified ECO4 Eligible - Remaining": int(total_eco4_expectation),
                "EPC verified GBIS Eligibile - Remaining": int(total_gbis_expectation),
                # At risk work
                "Work at risk due to audits": eco4_work_at_risk
            }
        )

    results_df = pd.DataFrame(results)
    results_df.to_csv("analysis - revised - audit update.csv")

    # TODO: Add in estimated GBIS (for eco jobs, of which look like gbis)
    # TODO: Change the left hand side number for our post CIGA estimates
without_survey_without_ciga_expected + without_survey_passed_ciga_expected + without_survey_eco4_expected + without_survey_identified_as_gbis_eco4 ) total_gbis_expectation = ( total_gbis_expectation + without_survey_gbis_expected + without_survey_identified_as_gbis_qualified ) results.append( { "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, "Original GGBIS Estimate - Remaining": original_gbis_remaining, # "Postcode List - Remaining": postcode_list_remaining, # "Of which sold": sales_since_nov, "EPC verified ECO4 Eligible - Remaining": int(total_eco4_expectation), "EPC verified GBIS Eligibile - Remaining": int(total_gbis_expectation), # At risk work "Work at risk due to audits": eco4_work_at_risk } ) results_df = pd.DataFrame(results) results_df.to_csv("analysis - revised - audit update.csv") # results_df["Delta vs November"] = 100 * ( # results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] # ) / results_df["Original ECO4 Estimate - Remaining"] # TODO: Add in estimated GBIS (for eco jobs, of which look like gbis) # TODO: Change the left hand side number for our post CIGA estimates def create_final_report(): """ This function will produce the final output for the HA analysis :return: """ epc_validated_results = pd.read_csv("analysis - revised - audit update.csv") pipeline_results = pd.read_csv("pipeline_remaining_raw.csv") #################################### # Original Warmfront estimates #################################### # Create the volumes result all_ha_summary_remaining = pipeline_results[ [ "('', '', '', 'HA Name')", "('ECO4 original', '', 'Remaining - #', '')", "('GBIS original', '', 'Remaining - #', '')", ] ].copy().rename( columns={ "('', '', '', 'HA Name')": "HA Name", "('ECO4 original', '', 'Remaining - #', '')": "# ECO4 remaining - All HA Summary", "('GBIS original', '', 'Remaining - #', '')": "# GBIS remaining - All HA Summary", } ) all_ha_summary_remaining["# Total remaining 
- All HA Summary"] = ( all_ha_summary_remaining["# ECO4 remaining - All HA Summary"] + all_ha_summary_remaining["# GBIS remaining - All HA Summary"] ) all_ha_summary_remaining = all_ha_summary_remaining.sort_values("HA Name") #################################### # Postcode list - pre-CIGA #################################### postcode_list_pre_ciga_remaining = pipeline_results[ [ "('', '', '', 'HA Name')", "('ECO4 pre-ciga', '', 'Remaining - #', '')", "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')", ] ].copy().rename( columns={ "('', '', '', 'HA Name')": "HA Name", "('ECO4 pre-ciga', '', 'Remaining - #', '')": "# ECO4 remaining - Postcode list (pre CIGA)", "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')": ( "# GBIS remaining - Postcode list (pre CIGA)" ), } ) postcode_list_pre_ciga_remaining["# Total remaining - Postcode list (pre CIGA)"] = ( postcode_list_pre_ciga_remaining["# ECO4 remaining - Postcode list (pre CIGA)"] + postcode_list_pre_ciga_remaining["# GBIS remaining - Postcode list (pre CIGA)"] ) postcode_list_pre_ciga_remaining = postcode_list_pre_ciga_remaining.sort_values("HA Name") #################################### # Postcode list - post-CIGA #################################### postcode_list_post_ciga_remaining = pipeline_results[ [ "('', '', '', 'HA Name')", "('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')", "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')", ] ].copy().rename( columns={ "('', '', '', 'HA Name')": "HA Name", "('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')": "# ECO4 remaining - Postcode list (post CIGA)", "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')": ( "# GBIS remaining - Postcode list (post CIGA)" ), } ) postcode_list_post_ciga_remaining["# Total remaining - Postcode list (post CIGA)"] = ( postcode_list_post_ciga_remaining["# ECO4 remaining - Postcode list (post 
CIGA)"] + postcode_list_post_ciga_remaining["# GBIS remaining - Postcode list (post CIGA)"] ) postcode_list_post_ciga_remaining = postcode_list_post_ciga_remaining.sort_values("HA Name") #################################### # From EPC Database #################################### from_epc_database = epc_validated_results[ [ "HA Name", "EPC verified ECO4 Eligible - Remaining", "EPC verified GBIS Eligibile - Remaining", "Work at risk due to audits" ] ].copy().rename( columns={ "EPC verified ECO4 Eligible - Remaining": "# ECO4 remaining - From EPC Database (post CIGA)", "EPC verified GBIS Eligibile - Remaining": "# GBIS remaining - From EPC Database (post CIGA)", "Work at risk due to audits": "ECO4 remaining work at risk due to Audits", } ) from_epc_database["# Total remaining - From EPC Database (post CIGA)"] = ( from_epc_database["# ECO4 remaining - From EPC Database (post CIGA)"] + from_epc_database["# GBIS remaining - From EPC Database (post CIGA)"] ) from_epc_database = from_epc_database.sort_values("HA Name") # Combine the datasets volumes = all_ha_summary_remaining.merge( postcode_list_pre_ciga_remaining, how="left", on="HA Name" ).merge( postcode_list_post_ciga_remaining, how="left", on="HA Name" ).merge( from_epc_database, how="inner", on="HA Name" ) revenue = volumes.copy() # Convert the ECO4 volumes to revenue for col in [ '# ECO4 remaining - All HA Summary', '# ECO4 remaining - Postcode list (pre CIGA)', '# ECO4 remaining - Postcode list (post CIGA)', '# ECO4 remaining - From EPC Database (post CIGA)', 'ECO4 remaining work at risk due to Audits' ]: revenue[col] = revenue[col] * 1710 # Convert the GBIS volumes to revenue for col in [ '# GBIS remaining - All HA Summary', '# GBIS remaining - Postcode list (pre CIGA)', '# GBIS remaining - Postcode list (post CIGA)', '# GBIS remaining - From EPC Database (post CIGA)' ]: revenue[col] = revenue[col] * 600 # Re-calculate the totals revenue['# Total remaining - All HA Summary'] = ( revenue['# ECO4 remaining - All 
def _remove_installed_assets(asset_list, survey_list):
    """
    Drop the assets that already have an installation status recorded against them in the survey list.

    :param asset_list: DataFrame of assets with an "asset_list_row_id" column.
    :param survey_list: DataFrame of surveys with "asset_list_row_id" and "installation_status" columns. May be
        empty, in which case the asset list is returned untouched.
    :return: The asset list restricted to rows without a matched installation status.
    """
    if survey_list.empty:
        return asset_list

    # pandas matches null merge keys against each other, so surveys that were never matched to an asset must be
    # removed first - otherwise they would knock out every asset that has no row id
    matched_surveys = survey_list.loc[
        survey_list["asset_list_row_id"].notna(),
        ["asset_list_row_id", "installation_status"]
    ]
    merged = asset_list.merge(matched_surveys, how="left", on="asset_list_row_id")

    # Anything that has an installation status has gone to installation, and therefore is not remaining
    merged = merged[merged["installation_status"].isna()]
    return merged.drop(columns=["installation_status"])


def _worksheet_to_frame(worksheet, header_row=1):
    """
    Convert an openpyxl worksheet into a DataFrame.

    :param worksheet: The openpyxl worksheet to read.
    :param header_row: 1-indexed row holding the column names; every row below it is treated as data.
    :return: DataFrame with one row per worksheet row below the header.
    """
    column_names = [cell.value for cell in worksheet[header_row]]
    rows = [list(values) for values in worksheet.iter_rows(min_row=header_row + 1, values_only=True)]
    return pd.DataFrame(rows, columns=column_names)


def _compact_postcode(postcodes):
    """
    Lower-case a Series of postcodes and remove the spaces, e.g. "ST6 6RF" -> "st66rf".

    :param postcodes: Series of postcode strings.
    :return: Series of compacted postcodes on the same index.
    """
    return postcodes.str.lower().str.replace(" ", "")


def identify_eco_works(loader, output_dir="local_data"):
    """
    Split the remaining (not yet installed) assets of selected HAs by ECO4 eligibility, store each non-empty
    split as a csv and summarise the counts.

    :param loader: Object exposing ``data``, a dict of HA name -> {"asset_list": DataFrame, "survey_list": DataFrame}.
    :param output_dir: Folder the csv files are written to. Created if it does not exist.
    :return: DataFrame with one row per processed HA and the columns "HA Name", "n_needing_ciga", "eco4" and
        "eco4_passed_ciga". Empty if none of the selected HAs are present in the loader.
    """
    # Previously run for: HA16 (For Housing), HA39 (Rooftop), HA41 (Settle), HA23 (Lambeth), HA14 (EMH),
    # HA7 (Believe), HA102 (Thrive)
    # HAs to process, mapped to the client name used in the output file names
    names = {
        "HA50": "Unitas",
        "HA15": "Fairhive",
        "HA107": "ACIS",
        "HA24": "LHP"
    }
    output_folder = Path(output_dir)

    breakdowns = []
    for ha, data_assets in loader.data.items():
        if ha not in names:
            continue

        # Remove the assets that have already gone to installation
        asset_list = _remove_installed_assets(
            data_assets["asset_list"].copy(),
            data_assets["survey_list"].copy()
        )

        splits = {
            "needs ciga": asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"].copy(),
            "eco4": asset_list[asset_list["ECO Eligibility"] == "eco4"].copy(),
            "eco4 passed ciga": asset_list[asset_list["ECO Eligibility"] == "eco4 - passed ciga"].copy(),
        }

        # Store the data - only the splits that actually contain properties
        for label, split in splits.items():
            if split.empty:
                continue
            output_folder.mkdir(parents=True, exist_ok=True)
            split.to_csv(output_folder / f"{names[ha]} - {label}.csv")

        breakdowns.append(
            {
                "HA Name": ha,
                "n_needing_ciga": splits["needs ciga"].shape[0],
                "eco4": splits["eco4"].shape[0],
                "eco4_passed_ciga": splits["eco4 passed ciga"].shape[0]
            }
        )

    return pd.DataFrame(breakdowns).fillna(0)


def unitas_data_prep(loader):
    """
    Adhoc - for Unitas (HA50), build the list of properties still to be surveyed in phase 2 and store it as
    "Unitas - phase 2 properties to Survey.xlsx".

    Steps: remove the assets that have an installation status or a completed phase 1 / phase 2 survey, add the
    properties from the second round of CIGA checks that have no guarantee, attach the newest EPC found for each
    property and attach the outcomes of previous survey visits.

    :param loader: Object exposing ``data["HA50"]`` with "asset_list" and "survey_list" DataFrames.
    :return: The DataFrame that is written to excel.
    :raises ValueError: If the EPC_AUTH_TOKEN environment variable is not set.
    :raises Exception: If a completed survey cannot be matched to exactly one asset, or an outcome matches more
        than one property / a property has more than two recorded outcomes.
    """
    # The EPC credential is read from the environment rather than being stored in the source. Read at call time so
    # values loaded from the .env file are picked up.
    # NOTE(review): assumes EPC_AUTH_TOKEN holds the token in the format SearchEpc expects - confirm
    epc_api_key = os.getenv("EPC_AUTH_TOKEN")
    if not epc_api_key:
        raise ValueError("EPC_AUTH_TOKEN must be set in the environment to query the EPC API")

    unitas_data = loader.data["HA50"].copy()
    unitas_asset_list = unitas_data["asset_list"].copy()
    unitas_survey_sheet = unitas_data["survey_list"].copy()

    # We remove the surveyed properties from the asset sheet
    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
    unitas_asset_list = _remove_installed_assets(unitas_asset_list, unitas_survey_sheet)

    # We read in the data for the further completed surveys
    unitas_phase_1_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
    )
    phase_1_surveys = _worksheet_to_frame(unitas_phase_1_workbook["ECO 4 - PHASE 1"])
    phase_2_surveys = _worksheet_to_frame(unitas_phase_1_workbook["ECO4 - PHASE 2"])

    # Correct phase 1 surveys in the same fashion as the previous approach
    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())

    # We check all phase 1 surveys are contained in the data we had before; the ones that are not are "additional"
    additional = []
    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
        # We look for the entry in the old survey sheet, first on postcode + number...
        matched_1 = unitas_survey_sheet[
            (unitas_survey_sheet["Post Code"] == row["Post Code"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_1.shape[0] == 1:
            continue

        # ...then on street + number
        matched_2 = unitas_survey_sheet[
            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_2.shape[0] == 1:
            continue

        additional.append(row.to_dict())
    additional = pd.DataFrame(additional)

    # Drop all of the occurances of "OFFICE USE ONLY" columns. Header cells can be empty (None), so only string
    # column names are inspected
    phase_2_surveys = phase_2_surveys.drop(
        columns=[c for c in phase_2_surveys.columns if isinstance(c, str) and "OFFICE USE ONLY" in c]
    )

    common_columns = [c for c in phase_2_surveys.columns if c in additional.columns]
    further_unitas_completed_surveys = pd.concat(
        [phase_2_surveys, additional[common_columns]], axis=0, ignore_index=True
    )

    # Add a phase 2 key. The ids below depend on the row order of the concat above
    further_unitas_completed_surveys["survey_list_row_id"] = [
        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
    ]
    not_in_asset_list = [
        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
    ]
    # Confirmed not in asset list
    additional_postcodes = ["st28bg"]

    full_asset_list = unitas_data["asset_list"].copy()
    full_asset_list["matching_postcode"] = _compact_postcode(full_asset_list["matching_postcode"])

    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
        "ST 5DT", "ST3 5DT"
    )

    # We match these back to the asset list
    matching_lookup = []
    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):
        if row["survey_list_row_id"] in not_in_asset_list:
            continue

        postcode_lower = row["Post Code"].lower().strip().replace(" ", "")
        if postcode_lower in additional_postcodes:
            continue

        # Filter asset list on postcode (literal substring match), then on house number
        df = full_asset_list[
            full_asset_list["matching_postcode"].str.contains(postcode_lower, regex=False, na=False)
        ]
        df = df[df["HouseNo"] == str(row["NO."])]
        if df.shape[0] != 1:
            raise Exception("NOT FOUND")

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    matching_lookup["phase_2_surveyed"] = True

    # We merge this onto the asset list and drop the rows where phase_2_surveyed is populated
    unitas_asset_list = unitas_asset_list.merge(matching_lookup, how="left", on="asset_list_row_id")
    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["phase_2_surveyed"])]

    # We add in the new CIGA submissions
    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
    ciga_round_2 = _worksheet_to_frame(unitas_round_2_ciga_workbook["Worksheet"])

    # We merge the ciga sheet to the CIGA dependent part of the asset list
    ciga_dependent_asset_list = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga")
    ].copy()
    ciga_round_2_matched = ciga_dependent_asset_list.merge(
        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
    )
    # Filter on just the properties that had no guarantee
    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]

    # ECO Eligibility counts noted by the original author:
    # not eligible              9227
    # failed ciga               2711
    # eco4 (subject to ciga)    2238
    # eco4 - passed ciga         901
    # gbis                       114
    # eco4                        91

    # We filter on the properties we're looking to re-survey
    unitas_properties_to_survey = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].isin(["eco4 - passed ciga", "eco4"])
    ].copy()
    unitas_properties_to_survey = pd.concat(
        [
            unitas_properties_to_survey,
            ciga_round_2_matched[unitas_properties_to_survey.columns]
        ]
    )

    # We now retrieve the lastest EPC data
    epc_data = []
    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
        property_type, _ = get_property_type_and_built_form(property_meta=unitas_property, ha_name="HA50")
        searcher = SearchEpc(
            address1=str(unitas_property["HouseNo"]),
            postcode=unitas_property["matching_postcode"],
            auth_token=epc_api_key,
            os_api_key="",
            property_type=property_type,
            full_address=unitas_property["matching_address"],
            fast=True
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None
        searcher.find_property(skip_os=True)
        if searcher.newest_epc is None:
            continue

        epc_data.append(
            {
                "asset_list_row_id": unitas_property["asset_list_row_id"],
                **searcher.newest_epc.copy()
            }
        )

    epc_df = pd.DataFrame(epc_data)
    # Pull out just the columns we need
    epc_df = epc_df[
        [
            "asset_list_row_id", "address1", "postcode", "current-energy-efficiency", "current-energy-rating",
            "inspection-date", "transaction-type", "built-form"
        ]
    ]
    epc_df["EPC Rating"] = (
        epc_df["current-energy-efficiency"].astype(str) + epc_df["current-energy-rating"].astype(str)
    )

    # Merge onto the Unitas data:
    epc_columns = ["EPC Rating", "inspection-date", "transaction-type", "built-form"]
    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
        epc_df[["asset_list_row_id"] + epc_columns], how="left", on="asset_list_row_id"
    )
    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
    )

    # Properties without an EPC have nulls after the left merge - label them explicitly
    for col in epc_columns:
        unitas_properties_to_survey_full[col] = np.where(
            pd.isnull(unitas_properties_to_survey_full[col]),
            "No EPC found",
            unitas_properties_to_survey_full[col]
        )
        unitas_properties_to_survey_full[col] = unitas_properties_to_survey_full[col].astype(str)

    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
        columns={
            "inspection-date": "Last EPC Inspection Date",
            "transaction-type": "Last EPC Reason",
            "built-form": "Last EPC Built Form",
        }
    )

    # We now match to the survey outcomes. The column names of this sheet are in row 2
    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
    )
    unitas_outcomes = _worksheet_to_frame(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
    unitas_outcomes = unitas_outcomes.rename(
        columns={
            "Notes (If 'no answer' under outcomes, have you checked around the property for access "
            "issues where possible?)": "Notes"
        }
    )

    # Merge outcomes onto properties to survey. The compact postcode is derived from the frame being filtered and
    # kept as a local Series, so no extra column ends up in the excel output
    survey_postcodes = _compact_postcode(unitas_properties_to_survey_full["matching_postcode"])

    outcome_matching = []
    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
        # We search for the corresponding entry in the properties to survey
        postcode_lower = outcome["Postcode"].lower().strip().replace(" ", "")

        # Filter on postcode (literal substring match), then on house number
        df = unitas_properties_to_survey_full[
            survey_postcodes.str.contains(postcode_lower, regex=False, na=False)
        ]
        df = df[df["HouseNo"] == str(outcome["No."])]

        if df.empty:
            continue
        if df.shape[0] != 1:
            raise Exception("something went wrong")

        outcome_matching.append(
            {
                "asset_list_row_id": df["asset_list_row_id"].values[0],
                **outcome.to_dict()
            }
        )

    outcome_matching = pd.DataFrame(outcome_matching)

    # We can have duplicate matches, so we format the Date letter sent column and retrieve the newest outcome
    outcome_matching["Date letters sent"] = outcome_matching["Date letters sent"].str.lower()
    outcome_matching["Extracted Date"] = outcome_matching["Date letters sent"].str.extract(
        r'(?:w[./]c )(\d{2}\.\d{2}\.\d{4})')
    outcome_matching["Extracted Date"] = pd.to_datetime(outcome_matching["Extracted Date"], format='%d.%m.%Y')

    # We sort by asset_list_row_id and extracted date, so the newest outcome of each property comes first
    outcome_matching = outcome_matching.sort_values(["asset_list_row_id", "Extracted Date"], ascending=[True, False])

    # Some properties will have multiple outcomes - for these, we re-format into a single row
    outcome_matching_grouped = []
    for _, grouped_data in outcome_matching.groupby("asset_list_row_id"):
        n_visits = grouped_data.shape[0]
        if n_visits == 1:
            outcome_matching_grouped.append(
                {
                    "Number of previous visits": 1,
                    **grouped_data.to_dict("records")[0]
                }
            )
        elif n_visits == 2:
            newest_visit = grouped_data.head(1)
            # The older visit is stored in the "... second visit" columns
            oldest_visit = grouped_data.tail(1)[
                ['Outcomes', 'Surveyor', 'Notes', 'Date letters sent']
            ].add_suffix(" second visit")
            outcome_matching_grouped.append(
                {
                    "Number of previous visits": 2,
                    **newest_visit.to_dict("records")[0],
                    **oldest_visit.to_dict("records")[0]
                }
            )
        else:
            raise Exception("something went wrong")

    outcome_matching_grouped = pd.DataFrame(outcome_matching_grouped)

    unitas_properties_to_survey_with_outcomes = unitas_properties_to_survey_full.merge(
        outcome_matching_grouped, how="left", on="asset_list_row_id"
    )
    unitas_properties_to_survey_with_outcomes["Number of previous visits"] = (
        unitas_properties_to_survey_with_outcomes["Number of previous visits"].fillna(0)
    )

    # Store as an excel
    unitas_properties_to_survey_with_outcomes.to_excel("Unitas - phase 2 properties to Survey.xlsx")

    return unitas_properties_to_survey_with_outcomes


def app():
    """
    Run the housing association (HA) analysis for the priority HAs: collect their input workbooks, load them
    through the DataLoader and run the forecasting, analysis and final report steps.

    :return:
    """
    # Determines if we want to use the cached data in s3
    use_cache = True
    # Determines if we want to perform the data pull
    pull_data = False
    # Override to re-build all inputs
    rebuild_inputs = False

    # Grab the December HA figures filepath
    december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"

    # Add in:
    priority_has = [
        "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25",
        "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54",
        "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42",
        # Added as of March 18th
        "HA44", "HA45", "HA51", "HA52", "HA17", "HA5", "HA20",
        # New HAS
        "HAXX", "HAXXX",
    ]

    # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
    # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
    # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE]
    #
    # Consider for ECO4:
    # HA 70 - have to merge ECO3 list though,
    # HA17 has LOTs of assets, but the asset list is a mess
    # HA53 but has EPCs done
    # Consider for GBIS:
    # Ignore for now:
    # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in

    # List all of the excel files in the data folder, filtered down to only the priority HAs. Each HA has its own
    # sub-folder named after the HA, so we filter on the folder name itself rather than on a fixed position in the
    # path string (which only works for a relative, "/"-separated path)
    wanted_has = set(priority_has)
    directories = [
        str(file)
        for entry in DATA_FOLDER.iterdir()
        if entry.is_dir() and entry.name in wanted_has
        for file in entry.iterdir()
        if file.suffix == '.xlsx'
    ]

    loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs)
    loader.load()
    loader.ha_facts_and_figures()

    # import pickle
    # with open("ha_analysis_data_temp.pkl", "wb") as f:
    #     pickle.dump(loader, f)

    # import pickle
    # with open("ha_analysis_data_temp.pkl", "rb") as f:
    #     loader = pickle.load(f)

    forecast_remaining_sales(loader)

    # Functions to produce the final output
    if pull_data:
        # If we need to pull EPC data
        # NOTE(review): this call was commented out in the original; confirm fml_data_pull still exists before
        # switching pull_data on
        fml_data_pull(loader)
    fml_analysis(loader)
    create_final_report()

    # Adhoc - for HA16, get the properties that still need a CIGA check
    # asset_list_ha16 = loader.data["HA16"]["asset_list"].copy()
    # ha_16_need_ciga = asset_list_ha16[
    #     asset_list_ha16["ECO Eligibility"].str.contains("subject to ciga")
    # ]
    # completed_cigas = loader.data["HA16"]["ciga_list"].copy()
    # # Store the results
    # ha_16_need_ciga.to_csv("ha16_need_ciga.csv")
    # completed_cigas.to_csv("ha16_completed_cigas.csv")
    #
    # # Adhoc - look at the current pipeline and identify how many dormant, CIGA dependent properties there are for
    # # live projects
    #
    # # Read excel
    # orderbook_filepath = "local_data/ha_data/Warmfront HA client order book overview_20240129.xlsx"
    # orderbook_workbook = openpyxl.load_workbook(orderbook_filepath)
    # orderbook_sheet = orderbook_workbook["Contractual Info"]
    # orderbook_colnames = [cell.value for cell in orderbook_sheet[1]]
    #
    # rows = []
    # for row in orderbook_sheet.iter_rows(min_row=2, values_only=False):
    #     row_data = [cell.value for cell in row]  # This will get you the cell values
    #     rows.append(row_data)
    #
    # orderbook = pd.DataFrame(rows, columns=orderbook_colnames)
    # live_orderbook = orderbook[orderbook["Live, New, or Historic?"] == "LIVE"].copy()
    # live_orderbook['Redacted HA'] = live_orderbook['Redacted HA'].str.replace(" ", "")
    #
    # dormant_properties = []
    # missed_has = []
    # for _, customer in live_orderbook.iterrows():
    #     if customer['Redacted HA'] not in loader.data.keys():
    #         missed_has.append(customer['Redacted HA'])
    #         continue
    #     asset_list = loader.data[customer['Redacted HA']]["asset_list"].copy()
    #     survey_list = loader.data[customer['Redacted HA']]["survey_list"].copy()
    #     # Remove sold
    #     if not survey_list.empty:
    #         survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
    #         asset_list = asset_list.merge(
    #             survey_list[["asset_list_row_id", "installation_status"]],
    #             how="left",
    #             on="asset_list_row_id"
    #         )
    #         # Anything that has an installation has gone to installation, and therefore is not remaining
    #         asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
    #         asset_list = asset_list.drop(columns=["installation_status"])
    #
    #     # We pull out the properties that need a CIGA check
    #     need_ciga = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"]
    #     need_archetype = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to archetype)"]
    #     need_ciga_and_archetype = asset_list[
    #         asset_list["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)"
    #     ]
    #
    #     dormant_properties.append(
    #         {
    #             "HA Name": customer['Redacted HA'],
    #             "Need CIGA": need_ciga.shape[0],
    #             "Need Archetype": need_archetype.shape[0],
    #             "Need CIGA and Archetype": need_ciga_and_archetype.shape[0]
    #         }
    #     )
    #
    # dormant_properties = pd.DataFrame(dormant_properties)
    # totals = dormant_properties.sum()
    # totals["HA Name"] = "Total"
    #
    # dormant_properties = pd.concat([dormant_properties, totals.to_frame().T])
    # dormant_properties.to_csv("dormant_properties.csv")
    #
    # loader.december_figures["ECO4 remaining"].sum()
    # december_figures = loader.december_figures.copy()
    # december_figures["ECO4 remaining"] = np.where(
    #     december_figures["ECO4 remaining"] < 0,
    #     0,
    #     december_figures["ECO4 remaining"]
    # )
    # december_figures["ECO4 remaining"].sum()