Model/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
2025-04-13 21:39:35 +01:00

7330 lines
316 KiB
Python

import os
import re
import openpyxl
from fuzzywuzzy import fuzz
from pathlib import Path
import msgpack
from datetime import datetime
import pandas as pd
import numpy as np
from utils.s3 import (
read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3, save_dataframe_to_s3_parquet
)
from utils.logger import setup_logger
from dotenv import load_dotenv
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
from etl.eligibility.ha_15_32.app import prepare_model_data_row
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from etl.epc.Record import EPCRecord
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc.DataProcessor import EPCDataProcessor
from datetime import datetime
import inspect
# Location this code is running from, resolved from the code object of a throwaway lambda
# (inspect.getfile) rather than from __file__.
src_file_path = inspect.getfile(lambda: None)
# Paths below are resolved relative to the directory containing src_file_path.
ENV_FILE = Path(src_file_path).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
DATA_FOLDER = Path(src_file_path).parent / "local_data" / "ha_data"
logger = setup_logger()
# The .env file must be loaded BEFORE any variable is read from the environment; reading the token
# first (as this module used to) yields None whenever the token is defined only in the .env file.
load_dotenv(ENV_FILE)
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
# Lookup tables, keyed by housing-association code ("HA1", "HA2", ...), that translate the raw
# property descriptions found in each association's data into the normalised vocabulary used here:
#   property type: 'House', 'Flat', 'Bungalow', 'Maisonette'
#   built form:    'Detached', 'Semi-Detached', 'Mid-Terrace', 'End-Terrace'
# A value of None marks a raw description that has no normalised equivalent.
#
# Three different shapes are used, depending on the association:
#   1. flat mapping            raw description -> property type            (e.g. "HA2", "HA5")
#   2. mapping per attribute   {"property_type": {...}, "built_form": {...}} (e.g. "HA7", "HA107")
#   3. mapping to a record     raw description -> {property type, built form} (e.g. "HA16", "HA39")
#
# Keys are matched verbatim, so casing, misspellings and leading/trailing whitespace inside the
# keys are significant (presumably they mirror the raw source values exactly - do not "tidy" them).
PROPERTY_TYPE_LOOKUP = {
    "HA1": {
        "built_form": {
            'Mid Terrace': 'Mid-Terrace',
            'Semi-Detached': 'Semi-Detached',
            'End Terrace': 'End-Terrace',
            'Detached': 'Detached',
            'Enclosed Mid': 'Mid-Terrace',
            'Detached Local Connect': 'Detached',
        }
    },
    "HA2": {
        'HOUSE': 'House',
        'FLAT': 'Flat',
        'SHELTERED': None,
        'BUNGALOW': 'Bungalow',
        'BED-SIT': None,
        'MAISONETTE': "Maisonette",
        'HOSTEL': None
    },
    "HA5": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Bedsit": None
    },
    "HA6": {
        "property_type": {
            'HOUSE': "House",
            'GROUND FLOOR FLAT': "Flat",
            'UPPER FLOOR FLAT': "Flat",
            'MAISONETTE': "Maisonette",
            'BUNGALOW': "Bungalow",
            'WARDEN BUNGALOW': "Bungalow",
            'WARDEN FLAT': "Flat",
            'EXTRACARE SCHEME': "Flat",
        }
    },
    "HA7": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "End Terraced": "End-Terrace",
        }
    },
    "HA8": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Room": None,
        "Other": None,
        "Commerical": None
    },
    "HA11": {
        "Flat": "Flat",
        "House": "House",
        "Semi-Det House": "House",
        "Bedsit": None,
        "End-Terr House": "House",
        "Mid-Terr House": "House",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "End Terr Flat": "Flat",
        "Mid Terr Flat": "Flat",
        "Detached Flat": "Flat",
    },
    "HA12": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
    },
    "HA13": {
        'House': "House",
        'Flat': "Flat",
        'House MT': "House",
        'House SD': "House",
        'House ET': "House",
        'Bungalow MT': "Bungalow",
        'Bungalow ET': "Bungalow",
        'ii': None,
    },
    "HA14": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        }
    },
    "HA15": {
        'House': 'House',
        'Flat': 'Flat',
        'Bungalow': 'Bungalow',
        'Maisonette': 'Maisonette',
        'Flat over garage': 'Flat',
    },
    # NOTE(review): the HA16 records use hyphenated inner keys ("property-type" / "built-form")
    # whereas the HA39 records use underscores ("property_type" / "built_form") - confirm that the
    # consumers of this table expect that difference.
    "HA16": {
        'Semi Detached Bungalow': {"property-type": "Bungalow", "built-form": "Semi-Detached"},
        'Mid Terraced House': {"property-type": "House", "built-form": "Mid-Terrace"},
        'End Terraced House': {"property-type": "House", "built-form": "End-Terrace"},
        'Low Rise Flat': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Semi-Detached House': {"property-type": "House", "built-form": "Semi-Detached"},
        'Detached Bungalow': {"property-type": "Bungalow", "built-form": "Detached"},
        'End Terraced Bungalow': {"property-type": "Bungalow", "built-form": "End-Terrace"},
        'Mid Terraced Bungalow': {"property-type": "Bungalow", "built-form": "Mid-Terrace"},
        'Medium Rise Flat': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Detached House': {"property-type": "House", "built-form": "Detached"},
        'Cottage Flat': {"property-type": "Flat", "built-form": "Semi-Detached"},
        'Maisonette Medium Rise': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Maisonette Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'End Terraced Town House': {"property-type": "House", "built-form": "End-Terrace"},
        'Flat Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"},
        'Mid Terraced Town House': {"property-type": "House", "built-form": "Mid-Terrace"},
    },
    "HA18": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Shop": None,
        "Hostel": None,
        "Block": None,
    },
    "HA20": {
        "House": "House",
        "Flat": "Flat",
        'Sheltered Flat': "Flat",
        'Maisonette': 'Maisonette',
        'Bungalow': 'Bungalow',
        'House. SD': 'House',
        'House. MT': 'House',
        'House. ET': 'House',
        'Sheltered Bungalow': 'Bungalow',
        'Guest Accomodation': None,
        'Sheltered House': 'House',
        'House. MT ': 'House',
        'House. D': 'House'
    },
    "HA24": {
        '01 HOUSE': 'House',
        '02 FLAT': 'Flat',
        '03 BUNGALOW': 'Bungalow',
        '10 PBUNGALOW': 'Bungalow',
        '01 HOUSE MID': 'House',
        '13 SBUNGALOW': 'Bungalow',
        '12 SBEDSIT': None, # BEDSIT does not match the specified property types
        '14 SFLAT': 'Flat',
        '05 BEDSIT': None,
        '04 MAISONETTE': 'Maisonette',
        '11 PFLAT': 'Flat',
        '09 PBEDSIT': None
    },
    "HA25": {
        'Flat': 'Flat',
        'Mid Terrace House': 'House',
        'Semi Detached House': 'House',
        'End Terrace House': 'House',
        'House': 'House',
        'Semi Detached Bung': 'Bungalow',
        'Bungalow': 'Bungalow',
        'End Terrace Bungalow': 'Bungalow',
        'Maisonnette': 'Maisonette',
        'Mid Terrace Bungalow': 'Bungalow',
        'Bedspace': None,
        'Detached House': 'House',
        'Bedsit': 'Flat',
        'Coach House': 'House',
        'Detached Bungalow': 'Bungalow',
        'Office Buildings': None,
        'Guest Room': None,
        'Mid Terrace Housekeeping ': 'House',
        'End Terrace Housex': 'House'
    },
    "HA28": {
        'Flat': 'Flat',
        'Semi detached house': 'House',
        'Terraced house': 'House',
        'Maisonette flat': 'Maisonette',
        'Sheltered bedsit': None,
        'APD flat': 'Flat',
        'Bungalow terraced': 'Bungalow',
        'Flat with partition': 'Flat',
        'Bungalow semi detached': 'Bungalow',
        'APD Bungalow': 'Bungalow',
        'Sheltered flat': 'Flat',
        'Bedsit Flat': 'Flat',
        'Bedsit bungalow semi detached': 'Bungalow',
        'Sheltered bungalow terraced': 'Bungalow',
        'Sheltered bedsit disabled': None,
        'Bedsit bungalow terraced': 'Bungalow',
        'Sheltered bungalow semi detached': 'Bungalow',
        'Sheltered warden flat': 'Flat',
        'Bungalow detached': 'Bungalow',
        'Block': None, # Does not match the specified property types
        'End Terraced House': 'House',
        'Mid Terraced House': 'House',
        '#N/A': None, # Assuming this is an invalid or missing entry
        0: None # Assuming 0 is also an invalid or missing entry
    },
    "HA30": {
        'House': 'House',
        'Flat': 'Flat',
        'Bungalow': 'Bungalow',
        'House with Attached Garage': 'House',
        'Bed Space': None, # Assuming this does not fit the specified property types
        'House with Garage': 'House',
        'Bungalow with Wheelchair Access': 'Bungalow',
        'Maisonette': 'Maisonette',
        'Flat with Wheelchair Access': 'Flat',
        'Bedsit': None, # Assuming this does not fit the specified property types
        'Flat w Wheelchair Access & Car Park': 'Flat',
        'House with Wheelchair Access': 'House',
        'Bungalow w Wheelchair Access & Car ': 'Bungalow'
    },
    "HA32": {
        'Bungalow': 'Bungalow',
        'Flat': 'Flat',
        'Bungalow Disabled': 'Bungalow', # "Disabled" properties categorized with their base type
        'House': 'House',
        'Dormer Bungalow': 'Bungalow',
        'Pop-In': None, # Does not fit the specified property types
        'Flat Disabled': 'Flat',
        'Laundry': None, # Does not fit the specified property types
        'Bedsit': None, # Excluded from the given categories
        'Shed': None, # Does not fit the specified property types
        'Store Room': None # Does not fit the specified property types
    },
    "HA34": {
        'Flat': 'Flat',
        'House': 'House',
        'Bungalow': 'Bungalow',
        'Maisonette': 'Maisonette',
        'ND': None,
    },
    "HA35": {
        "Flat": "Flat",
        "Maisonette": "Maisonette",
        "House": "House",
        "Bedsit": None,
        "2 Bedroom Unknown": None,
        "1 Bedroom Unknown": None,
        "3 Bedroom Unknown": None,
        "4 Bedroom Unknown": None,
    },
    "HA37": {
        "FLT": "Flat",
        "HSE": "House",
        "BNW": "Bungalow",
        "MAS": "Maisonette",
        "HSL": None
    },
    # Record-shaped entries: built_form is None where the description carries no built-form
    # information (flats, maisonettes, dormer bungalows).
    "HA39": {
        "Semi house": {"property_type": "House", "built_form": "Semi-Detached"},
        "1st floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"},
        "Ground floor flat": {"property_type": "Flat", "built_form": None},
        "End terrace house": {"property_type": "House", "built_form": "End-Terrace"},
        "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"},
        "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"},
        "2nd floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"},
        "3rd floor flat": {"property_type": "Flat", "built_form": None},
        "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"},
        "Maisonette": {"property_type": "Maisonette", "built_form": None},
        "Detached house": {"property_type": "House", "built_form": "Detached"},
        "Lower ground floor flat": {"property_type": "Flat", "built_form": None},
        "Dormer bungalow": {"property_type": "Bungalow", "built_form": None},
        "Basement flat": {"property_type": "Flat", "built_form": None},
        "Cluster House": {"property_type": "House", "built_form": "Detached"},
        "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None},
        "Ground floor flat with study": {"property_type": "Flat", "built_form": None},
        "4th floor flat": {"property_type": "Flat", "built_form": None},
        "1st floor flat with study room": {"property_type": "Flat", "built_form": None},
        "2nd floor flat with study": {"property_type": "Flat", "built_form": None},
    },
    "HA41": {
        'Garage': None,
        'House 1919-1945': 'House',
        'House 1946-1964': 'House',
        'Flats & Maisonettes post 1974': 'Flat',
        'Non traditional houses': 'House',
        'Sheltered': None,
        'Flats & Maisonettes 1965-1974': 'Flat',
        'House post 1974': 'House',
        'Block': None,
        'Flats & Maisonettes 1946-1964': 'Flat',
        'House 1965-1974': 'House',
        'Non traditional flats': 'Flat',
        'Bungalow 1965-1974': 'Bungalow',
        'PIMSS EMPTY': None,
        'Bungalow post 1974': 'Bungalow',
        'Bungalow 1946-1964': 'Bungalow',
        'Flats & Maisonettes 1919-1945': 'Flat',
        'House pre 1919': 'House',
        'Flats & Maisonettes pre 1919': 'Flat',
        'Bungalow 1919-1945': 'Bungalow',
        'Office': None
    },
    "HA42": {
        'Flat': 'Flat',
        'House': 'House',
        'Flat Basement': 'Flat',
        'Room': None,
        'Bedsit Flat': 'Flat',
        'Maisonette': 'Maisonette',
        'Scheme Office': None,
        'Scheme Lounge': None,
        'Bungalow': 'Bungalow',
        'Garage': None,
        'Scheme Sleep Room': None,
        'Cluster': None,
        'Scheme Room': None
    },
    "HA45": {
        'Large block of flats': 'Flat',
        'Small block of flats/dwelling converted in to flats': 'Flat',
        'Semi-detached house': 'House',
        'Mid-terraced house': 'House',
        'End-terraced house': 'House',
        'Block of flats': 'Flat',
        'Detached house': 'House',
        'Flat in mixed use building': 'Flat',
    },
    "HA48": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Unit": None
    },
    "HA50": {
        'House': 'House',
        'Bungalow': 'Bungalow',
        'Flat': 'Flat',
        'House SD': 'House',
        'House MT': 'House',
        'House ET': 'House',
        'Bungalow ET': 'Bungalow',
        'House SD ': 'House',
        'House. SD': 'House',
        'Bungalow SD': 'Bungalow',
        'Bungalow MT': 'Bungalow',
        'Bungalow D': 'Bungalow',
        'House D': 'House',
        'House. MT': 'House',
        'House ': 'House',
        'House ET ': 'House',
        ' ': None,
        'Flat?': 'Flat',
        'Bungalow ': 'Bungalow'
    },
    "HA51": {
        'FLAT': 'Flat',
        'HOUSE': 'House',
        'MAISONETTE': 'Maisonette',
        'BEDSIT': None, # Considering as a non-specific residential category here
        'BUNGALOW': 'Bungalow',
    },
    "HA52": {
        'House - Mid Terrace': 'House',
        'Flat - First Floor': 'Flat',
        'Flat - Ground Floor': 'Flat',
        'House - Semi-Detached': 'House',
        'House - End Terrace': 'House',
        'Flat - Second Floor': 'Flat',
        'Bedsit': None, # Considering as a non-specific residential category here
        'Bungalow - Semi-Detached': 'Bungalow',
        'Bungalow - Mid Terrace': 'Bungalow',
        'Bungalow - End Terrace': 'Bungalow',
        'House - Detached': 'House',
        'Flat - Third Floor': 'Flat',
        'House attached to flats': 'House',
        'Flat - Fourth Floor': 'Flat',
        'Bungalow - Detached': 'Bungalow'
    },
    "HA56": {
        'House Non Specific': 'House',
        'HOUSE TERRACED': 'House',
        'HOUSE - SEMI DETACHD': 'House',
        'Bungalow': 'Bungalow',
        'House - End Terraced': 'House',
        'Block': None,
        'Block with Communal': None,
        'Bungalow - Terraced': 'Bungalow',
        'Bungalow - Semi Dtch': 'Bungalow',
        'Block House with rooms': None,
        'Bungalow - End Terr': 'Bungalow',
        'House - Mid Terraced': 'House',
        'Bungalow - Detached': 'Bungalow',
        'House - Detached': 'House',
        'HOUSE THREE STOREY': 'House',
        'Maisonette': 'Maisonette',
        'Communal Block': None,
        'Scheme': None
    },
    "HA63": {
        'Flat': 'Flat',
        'House - Semi detached': 'House',
        'House - Detached': 'House',
        'House - End Terrace': 'House',
        'House - Mid Terrace': 'House',
        'Bungalow - Semi detached': 'Bungalow',
        'Bungalow': 'Bungalow',
        'Bedsit': None, # Considering as a non-specific residential category here
        'Maisonette': 'Maisonette',
        'Bungalow - End Terrace': 'Bungalow',
        'Bungalow - Detached': 'Bungalow',
        'Maisonette - Mid Terrace': 'Maisonette',
        'Maisonette - End Terrace': 'Maisonette',
        'Studio Flat': 'Flat',
        'Maisonette - Detached': 'Maisonette',
        'Bungalow - Mid Terrace': 'Bungalow',
        'Bedsit - Mid Terrace': None,
        'Bedsit - End Terrace': None,
        'Amenity Block - Semi detached': None, # Assuming non-residential
        'Maisonette - Semi Detached': 'Maisonette',
        'Amenity Block - Detached': None, # Assuming non-residential
        'Hostel': None, # Typically not considered a standard residential property for this context
        'Bungalow - Attached': 'Bungalow',
        'Unknown': None, # Not enough information to categorize
        'Studio Flat - Mid Terrace': 'Flat',
        'Chalet - Wheelchair': None # Specialized type, not categorized here
    },
    "HA107": {
        "property_type": {
            "HOUSE": "House",
            "BUNGALOW": "Bungalow",
            "GRD FLOOR FLAT": "Flat",
            "FIRST FLOOR FLAT": "Flat",
            "SHELTERED BUNGALOW": "Bungalow",
            "MAISONETTE": "Maisonette",
            "SECOND FLOOR FLAT": "Flat",
            "SHELTERED FIRST FLR": "Flat",
            "SHELTERED GROUND FLR": "Flat",
            # NOTE(review): plain bedsit entries elsewhere in this table map to None or 'Flat';
            # confirm that "House" is really intended for a ground-floor bedsit.
            "GRD FLOOR BED SIT": "House"
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "Detatched": "Detached",
        }
    },
    "HA117": {
        "Flat": "Flat",
        "House": "House",
        "Bungalow": "Bungalow",
        "Flat over garage/underpass": "Flat",
    },
    "HAXXX": {
        'mid terraced house': 'House',
        'semi detached house': 'House',
        '1st fl 4 in a block': 'Flat',
        'G/F 4 in a block': 'Flat',
        'end terraced house': 'House',
        '1st floor flat': 'Flat',
        'G/F floor flat': 'Flat',
        'semi detached bungalow': 'Bungalow',
        '2nd floor flat': 'Flat',
        'mid terrace bungalow': 'Bungalow',
        'detached bungalow': 'Bungalow',
        'end terrace bungalow': 'Bungalow',
        'Staff accommodation': None # Marked as None due to its special nature
    }
}
class DataLoader:
# For each housing association listed here: the name of the asset-list column that holds the
# address and the name of the column that holds the postcode.
# Rows are (association, address column, postcode column).
COLUMN_CONFIG = {
    association: {"address": address_column, "postcode": postcode_column}
    for association, address_column, postcode_column in (
        ("HA1", "Address", "Address - Postcode"),
        ("HA5", "Address", "matching_postcode"),
        ("HA6", "propertyaddress", "address"),  # The 'address' column actually contains postcode
        ("HA12", "Full Address", "Postcode"),
        ("HA16", "Address", "Postcode"),
        ("HA24", "Address", "Postcode"),
        ("HA25", "T1_Address", "matching_postcode"),
        ("HA30", "A_Address", "A_Postcode"),
        ("HA31", "A_Address", "matching_postcode"),
        ("HA45", "Full postal address", "Postcode"),
        ("HA48", "Full Address", "Postcode"),
        ("HA49", "Property Address Full", "Property Postcode"),
        ("HA52", "Postal Address", "POSTCODE"),
        ("HA54", "Postal Address", "matching_postcode"),
    )
}
# Integer figure per housing association for the CIGA data set.
# NOTE(review): presumably the number of records that could not be matched - TODO confirm.
UNMATCHED_CIGA = dict(
    HA2=0,
    HA6=117,
    HA9=0,
    HA12=6,
    HA13=119,
    HA14=3,
    HA15=3,
    HA16=7,
    HA24=12,
    HA50=4,
    HA63=15,
    HA107=51,
    HA48=0,
    HA45=0,
    HA52=5,
    HA20=6,
)
# Integer figure per housing association for the ECO3 data set.
# NOTE(review): presumably the number of records that could not be matched - TODO confirm.
UNMATCHED_ECO3 = dict(
    HA25=154,
    HA41=26,
    HA50=5,
    HA56=320,
    HA63=0,
    HA117=4,
    HA51=24,
)
def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
    """
    Store the loader configuration and initialise the result holders.

    :param directories: stored unchanged on ``self.directories``
    :param december_figures_filepath: stored unchanged on ``self.december_figures_filepath``
    :param use_cache: stored unchanged on ``self.use_cache``
    :param rebuild: stored unchanged on ``self.rebuild``
    """
    # Caller-supplied settings, kept exactly as passed in
    self.directories, self.use_cache = directories, use_cache
    self.december_figures_filepath, self.rebuild = december_figures_filepath, rebuild
    # Result holders: an empty mapping and two values that start out unset
    self.data = dict()
    self.december_figures = self.facts_and_figures = None
def create_asset_list_matching_address(self, ha_name, asset_list):
    """
    Add the ``matching_address`` and ``matching_postcode`` columns to an asset list.

    Every source column is converted to ``str``, lower-cased and stripped of surrounding
    whitespace. Depending on the housing association the address is either taken from a single
    column (associations listed in ``COLUMN_CONFIG``) or built by joining several columns,
    followed by the postcode, with ``", "``.

    NOTE(review): cells are stringified with ``astype(str)`` before joining, so missing values
    (typically rendered as the text "nan") become part of the joined address - confirm that this
    is acceptable for the downstream matching.

    :param ha_name: housing-association code, e.g. ``"HA18"``
    :param asset_list: DataFrame holding the association's asset list; it is modified in place
    :return: the same ``asset_list`` object with the two matching columns added
    :raises NotImplementedError: if no rule exists for ``ha_name``
    :raises KeyError: if a column required for ``ha_name`` is missing from ``asset_list``
    """

    def normalise(column):
        # Canonical text form used for matching: string, lower case, no surrounding whitespace
        return asset_list[column].astype(str).str.lower().str.strip()

    def join_with_commas(parts):
        # Element-wise "a, b, c" concatenation of already normalised Series
        joined = parts[0]
        for part in parts[1:]:
            joined = joined + ", " + part
        return joined

    # Associations whose address is assembled from several columns. Each tuple lists the address
    # columns in join order; the LAST entry is the postcode column, which is appended to the
    # address and also becomes matching_postcode. Column names are matched verbatim (including
    # casing and leading spaces).
    joined_columns = {
        "HA2": ("Address Line 1", "Address Line 2", "Postcode"),
        "HA7": ("Address", "Address2", "Address3", "Postcode"),
        "HA8": ("AddressLine1", "AddressLine2", "Postcode"),
        "HA9": (
            "House Number", "Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4", "Postcode"
        ),
        "HA11": ("Address 1", "Address 2", "Address 3", "Post Code"),
        "HA13": ("Address 1", "address 2", "Address 3", "Postcode"),
        "HA14": ("Address 1", "Address 2", "Address 3", "Address 4", "Postcode"),
        "HA15": ("Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4", "Postcode"),
        "HA18": ("Address", "Post Code"),
        "HA19": ("Address1", "Address2", "Address3", "Postcode"),
        "HA20": (
            "House Name", "Block", "Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4",
            "Postcode"
        ),
        "HA21": ("Address", "PostCode"),
        "HA27": (" Address", " Postcode"),
        "HA28": ("House Number", "Street 1", "Postcode"),
        "HA32": ("Dwelling num", "Street", "Postcode"),
        "HA33": ("ADDRESS", "POST CODE"),
        "HA34": (" Address", " Postcode"),
        "HA35": (
            "Address Line 1", "Address Line 2", "Address Line 3", "Address Line 4", "Address Post Code"
        ),
        "HA37": ("ADDRESS LINE 1", "ADDRESS LINE 2", "ADDRESS LINE 3", "POSTCODE"),
        "HA38": ("House_Number", "Address_Line_1", "Address_Line_2", "Address_Line_3", "Postcode"),
        "HA39": ("add_1", "add_2", "add_3", "add_4", "add_5", "post_code"),
        "HA41": (
            "AddressLine1", "AddressLine2", "AddressLine3", "AddressLine4", "AddressLine5", "Postcode"
        ),
        "HA44": ("Address 1", "Address 2", "Postal Code"),
        "HA50": ("Address Line 1", "Post Code"),
        "HA51": ("Address Line 1", "Address Line 2", "Address Line 3", "Postcode"),
        "HA56": ("Address 1", "Address 2", "Address 3", "Post Code"),
        "HA63": ("Address1", "POSTCODE"),
        "HA70": ("Address1", "POSTCODE"),
        "HA107": ("House No", "Street", "Town", "District", "Postcode"),
        "HA117": ("Address1", "Address2", "PostCode"),
        "HAXX": ("Address", "PostCode"),
        "HAXXX": ("Combined Address", "Postcode"),
    }

    if ha_name == "HA25":
        # HA25 has no postcode column of its own: the postcode is taken to be the last two
        # whitespace-separated tokens of the address.
        asset_list["matching_address"] = normalise(self.COLUMN_CONFIG[ha_name]["address"])
        asset_list["matching_postcode"] = asset_list["matching_address"].apply(
            lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x
        )
    elif ha_name in self.COLUMN_CONFIG:
        # Address and postcode each live in a single, configured column
        columns = self.COLUMN_CONFIG[ha_name]
        asset_list["matching_address"] = normalise(columns["address"])
        asset_list["matching_postcode"] = normalise(columns["postcode"])
    elif ha_name == "HA42":
        # The dwelling number and the street are separated by a space rather than ", "
        postcode = normalise("Postcode")
        asset_list["matching_address"] = join_with_commas([
            normalise("Dwelling Number") + " " + normalise("Street"),
            normalise("Locality"),
            normalise("Town"),
            postcode,
        ])
        asset_list["matching_postcode"] = postcode
    elif ha_name in joined_columns:
        *address_columns, postcode_column = joined_columns[ha_name]
        postcode = normalise(postcode_column)
        asset_list["matching_address"] = join_with_commas(
            [normalise(column) for column in address_columns] + [postcode]
        )
        if ha_name == "HA51":
            # Prefix the block name, but only on rows where a non-blank block is present
            has_block = asset_list["Block"].str.strip().str.len() > 0
            asset_list["matching_address"] = np.where(
                has_block,
                normalise("Block") + ", " + asset_list["matching_address"],
                asset_list["matching_address"]
            )
        asset_list["matching_postcode"] = postcode
    else:
        raise NotImplementedError(
            f"No address-matching rule is implemented for housing association {ha_name!r}"
        )
    return asset_list
@staticmethod
def extract_property_info_ha107(properties):
    """
    Derive a normalised property type and built form from HA107 free-text property descriptions.

    Each description is scanned for a known keyword (case-sensitive substring match); the first
    keyword found wins. Descriptions that are not strings (e.g. missing values) map to None for
    both outputs.

    :param properties: iterable of raw "Property type" values
    :return: DataFrame with columns "Property type" (the raw value), "property_type" and
        "built_form"
    """
    property_types = {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
    }
    # "Semi Detached" has to be tested before "Detached": the latter is a substring of the
    # former, so testing it first would classify every semi-detached property as detached.
    built_forms = {
        "Semi Detached": "Semi-Detached",
        "End Terrace": "End-Terrace",
        "Mid Terrace": "Mid-Terrace",
        "Detached": "Detached",
    }

    def first_match(description, lookup):
        # Missing values cannot contain a keyword
        if not isinstance(description, str):
            return None
        for keyword, mapped_value in lookup.items():
            if keyword in description:
                return mapped_value
        return None

    results = [
        {
            "Property type": property_description,
            "property_type": first_match(property_description, property_types),
            "built_form": first_match(property_description, built_forms),
        }
        for property_description in properties
    ]
    # Naming the columns keeps the frame mergeable on "Property type" even when it is empty
    return pd.DataFrame(results, columns=["Property type", "property_type", "built_form"])
def append_asset_list_built_form(self, ha_name, asset_list):
    """
    Add built form / property type columns for the housing associations that need them.

    HA6 gets a "built_form" column computed row by row from "Property Type". HA107 gets the
    columns produced by extract_property_info_ha107, joined on "Property type". Any other
    housing association is returned untouched.

    :param ha_name: housing association identifier, e.g. "HA6"
    :param asset_list: asset list DataFrame
    :return: the asset list, with the extra columns where applicable
    """
    if ha_name == "HA6":
        built_form = asset_list["Property Type"].apply(self.identify_built_form_ha6)
        asset_list["built_form"] = built_form

    if ha_name == "HA107":
        # Map each distinct description once, then broadcast back with a left join
        distinct_descriptions = asset_list["Property type"].unique()
        description_lookup = self.extract_property_info_ha107(distinct_descriptions)
        asset_list = asset_list.merge(description_lookup, how="left", on="Property type")

    return asset_list
@staticmethod
def create_asset_list_house_no(ha_name, asset_list):
    """
    Append a "HouseNo" column to the asset list.

    For housing associations that supply the house number in a dedicated column, that column is
    copied. For everyone else the house number is taken to be the first space-separated token
    of the first comma-separated segment of "matching_address".

    :param ha_name: housing association identifier, e.g. "HA107"
    :param asset_list: asset list DataFrame; needs "matching_address" unless the housing
        association has a dedicated house number column
    :return: the asset list with a "HouseNo" column
    """
    dedicated_columns = {
        "HA107": "House No",
        "HA32": "Dwelling num",
        "HA28": "House Number",
        "HA38": "House_Number",
        "HA9": "House Number",
        "HAXXX": "Door Number",
    }
    if ha_name in dedicated_columns:
        asset_list["HouseNo"] = asset_list[dedicated_columns[ha_name]].copy()
        return asset_list

    first_segment = asset_list["matching_address"].str.split(",", expand=True)[0]
    tokens = first_segment.str.split(" ", expand=True)
    house_no = tokens[0]
    # If we have "flat" or "valley" as the first token, the house number is actually the second
    # token. The second token column only exists when at least one address contains a space, so
    # guard against it being absent instead of raising a KeyError.
    if 1 in tokens.columns:
        house_no = house_no.mask(house_no.isin(["flat", "valley"]), tokens[1])
    # Remove trailing punctuation such as , or ;
    house_no = house_no.str.rstrip(",;").rename("HouseNo")
    return pd.concat([asset_list, house_no.to_frame()], axis=1)
@staticmethod
def create_ciga_list_house_no(ciga_list):
    """
    Append a "HouseNo" column to the CIGA list.

    The house number is the first space-separated token of the first comma-separated segment
    of "Matched Address".

    :param ciga_list: CIGA DataFrame with a "Matched Address" column
    :return: a new DataFrame, the CIGA list plus "HouseNo"
    """
    leading_segment = ciga_list["Matched Address"].str.split(",", expand=True)[0]
    # Only the first token matters; however many further tokens the split produced are ignored
    first_token = leading_segment.str.split(" ", expand=True).iloc[:, 0]
    house_no_frame = first_token.rename("HouseNo").to_frame()
    return pd.concat([ciga_list, house_no_frame], axis=1)
@staticmethod
def dedupe_ciga_list(ciga_list):
    """
    Drop CIGA rows that refer to the same address.

    A "unique_key" column is built from "Matched Address" + "Matched Postcode" with all spaces
    and punctuation removed; only the first row for each key is kept.

    :param ciga_list: CIGA DataFrame with "Matched Address" and "Matched Postcode" columns
    :return: the de-duplicated CIGA list, including the "unique_key" column
    """
    unique_key = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
    # Remove spaces from the unique key
    unique_key = unique_key.str.replace(" ", "", regex=False)
    # Remove punctuation from the unique key. regex=True has to be explicit: from pandas 2.0
    # the default is a literal replacement, which would leave the punctuation in place.
    unique_key = unique_key.str.replace(r"[^\w\s]", "", regex=True)
    ciga_list["unique_key"] = unique_key
    # Drop duplicated keys
    return ciga_list[~ciga_list["unique_key"].duplicated()]
@staticmethod
def get_asset_sheetname(workbook):
    """
    Work out which worksheet holds the asset list.

    Known names are tried in priority order. "Asset" only counts when the workbook has no
    "Assets" sheet. "Assets" is the fallback and is returned without checking that it exists.

    :param workbook: object exposing a "sheetnames" collection
    :return: name of the asset worksheet
    """
    available = workbook.sheetnames
    for candidate in ("Asset List", "Asset list"):
        if candidate in available:
            return candidate
    if "Asset" in available and "Assets" not in available:
        return "Asset"
    for candidate in ("Decent Homes Stock", "Report"):
        if candidate in available:
            return candidate
    return "Assets"
@staticmethod
def get_ciga_sheetname(workbook):
    """
    Work out which worksheet holds the CIGA checks.

    Known spellings are tried in priority order; "CIGA" is the fallback and is returned
    without checking that it exists.

    :param workbook: object exposing a "sheetnames" collection
    :return: name of the CIGA worksheet
    """
    known_names = ("CIGA Checks", "CIGA checks", "CIGA check", "CIGA Check", "CIGA requested")
    available = workbook.sheetnames
    return next((name for name in known_names if name in available), "CIGA")
@staticmethod
def get_survey_sheetname(workbook):
    """
    Work out which worksheet holds the ECO survey list.

    Known spellings are tried in priority order; "ECO surveys" is the fallback and is returned
    without checking that it exists.

    :param workbook: object exposing a "sheetnames" collection
    :return: name of the survey worksheet
    """
    known_names = ("ECO Surveys", "ECO Survey", "ECO 4 Surveys completed", "ECO4 Surveys")
    available = workbook.sheetnames
    return next((name for name in known_names if name in available), "ECO surveys")
@staticmethod
def correct_ha51_asset_list(asset_list):
    """
    Manual correction for HA51: rows whose matching address contains "61 wandle bank" take
    their house number from the lower-cased "Block" column.

    :param asset_list: asset list with "matching_address", "Block" and "HouseNo" columns
    :return: the corrected asset list
    """
    at_wandle_bank = asset_list["matching_address"].str.contains("61 wandle bank")
    block_as_house_no = asset_list["Block"].str.lower()
    asset_list["HouseNo"] = np.where(at_wandle_bank, block_as_house_no, asset_list["HouseNo"])
    return asset_list
def prepare_ha17(self, workbook):
    """
    Build the HA17 asset list from its two cavity wall worksheets.

    Both "Blocks List - Cavity Wall only" and "Street Properties - Cavity Wall" take their
    headers from row 2. Block data is read from row 4 onwards, street property data from row 3
    onwards. The two sheets are stacked, then any row whose matching address contains
    "<start> to <end> <rest>" is expanded into one row per house number, unless the row is
    marked "Not Eligible", in which case it is kept as a single row.

    :param workbook: openpyxl workbook containing both worksheets
    :return: asset list with matching_address, matching_postcode, property_type,
        "ECO Eligibility", asset_list_row_id and the house number columns
    """
    block_name_column = "Block Name\n[as per Naming Convention procedure]"
    shared_columns = ["matching_address", "matching_postcode", "property_type", "ECO Eligibility"]

    def normalise(series):
        return series.astype(str).str.lower().str.strip()

    def sheet_to_frame(sheet, first_data_row):
        # Headers always sit on row 2 for this workbook
        header = [cell.value for cell in sheet[2]]
        body = [
            [cell.value for cell in row]
            for row in sheet.iter_rows(min_row=first_data_row, values_only=False)
        ]
        return pd.DataFrame(body, columns=header)

    # Blocks: every entry is treated as a flat
    blocks_df = sheet_to_frame(workbook["Blocks List - Cavity Wall only"], 4)
    blocks_df["matching_address"] = (
        normalise(blocks_df[block_name_column]) + ", " +
        normalise(blocks_df["Block Street Name"]) + ", " +
        normalise(blocks_df["Postcode"])
    )
    blocks_df["matching_postcode"] = normalise(blocks_df["Postcode"])
    blocks_df["property_type"] = "Flat"

    # Street properties: the property type comes from the typology column
    street_properties_df = sheet_to_frame(workbook["Street Properties - Cavity Wall"], 3)
    street_properties_df["matching_address"] = (
        normalise(street_properties_df[block_name_column]) + ", " +
        normalise(street_properties_df["Postcode"])
    )
    street_properties_df["matching_postcode"] = normalise(street_properties_df["Postcode"])
    street_properties_df["property_type"] = street_properties_df[
        "Block typology based on dwelling type\n[defined list]"
    ]

    asset_list_compressed = pd.concat(
        [blocks_df[shared_columns], street_properties_df[shared_columns]],
        axis=0
    )

    # Expand house number ranges into individual addresses
    range_pattern = r"(\d+)\s+to\s+(\d+)\s+(.*)"
    expanded_rows = []
    for _, record in tqdm(asset_list_compressed.iterrows(), total=len(asset_list_compressed)):
        found_range = None
        if record["ECO Eligibility"] != "Not Eligible":
            found_range = re.search(range_pattern, record["matching_address"])
        if found_range is None:
            # Ineligible rows and rows without a range are carried over as they are
            expanded_rows.append(record.to_dict())
            continue

        first_number = int(found_range.group(1))
        last_number = int(found_range.group(2))
        remainder = found_range.group(3)
        template = record.to_dict()
        expanded_rows.extend(
            {**template, "matching_address": f"{number} {remainder}"}
            for number in range(first_number, last_number + 1)
        )

    asset_list = pd.DataFrame(expanded_rows)
    # Add in asset_list_row_id
    asset_list["asset_list_row_id"] = [f"HA17{position}" for position in range(len(asset_list))]
    # Add on house number
    return self.create_asset_list_house_no(ha_name="HA17", asset_list=asset_list)
def load_asset_list(self, filepath, ha_name):
    """
    Load a housing association workbook and return its asset, survey, CIGA and ECO3 lists.

    The asset sheet is read with row 1 as headers, given row ids, matching address / postcode,
    house number and built form, then passed through correct_<ha>_asset_list if such a method
    exists. ECO3, survey and CIGA sheets are read the same way and merged against the assets.

    Early exits:
    - HA17 is built by prepare_ha17 and returns empty survey / CIGA / ECO3 frames.
    - HA1 and HA27 return straight after the asset list is prepared.
    - HA25 returns after the ECO3 step, with empty survey and CIGA frames.

    :param filepath: path of the Excel workbook
    :param ha_name: housing association identifier, e.g. "HA6"
    :return: tuple (asset_list, survey_list, ciga_list, eco3_list)
    """
    workbook = openpyxl.load_workbook(filepath)
    if ha_name == "HA17":
        return self.prepare_ha17(workbook), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def read_sheet(sheet, header=None):
        # Row 1 holds the headers, data starts on row 2; columns without a header are dropped
        if header is None:
            header = [cell.value for cell in sheet[1]]
        body = [[cell.value for cell in row] for row in sheet.iter_rows(min_row=2, values_only=False)]
        frame = pd.DataFrame(body, columns=header)
        return frame.loc[:, frame.columns.notnull()]

    def drop_blank_rows(frame):
        return frame.loc[frame.loc[:, frame.columns].notnull().any(axis=1)]

    # ---- Asset list ----
    asset_sheet = workbook[self.get_asset_sheetname(workbook)]
    asset_header = [cell.value for cell in asset_sheet[1]]
    # For these housing associations one header cell is overwritten, by position, with
    # "matching_postcode"
    postcode_header_position = {"HA25": 11, "HA31": 2, "HA54": 10, "HA5": 2}
    if ha_name in postcode_header_position:
        asset_header[postcode_header_position[ha_name]] = "matching_postcode"

    asset_list = read_sheet(asset_sheet, asset_header)
    # Remove entirely empty rows - consider all columns apart from row_color
    has_content = asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)
    asset_list = asset_list.loc[has_content]
    asset_list["asset_list_row_id"] = [ha_name + str(position) for position in range(len(asset_list))]

    asset_list = self.create_asset_list_matching_address(ha_name=ha_name, asset_list=asset_list)
    asset_list = self.create_asset_list_house_no(ha_name=ha_name, asset_list=asset_list)
    asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list)

    # Apply the housing-association-specific correction, when one is defined
    correct_asset_list = getattr(self, f"correct_{ha_name.lower()}_asset_list", None)
    if correct_asset_list is not None:
        asset_list = correct_asset_list(asset_list)

    # These have no survey or CIGA lists, so the asset list is all there is to return
    if ha_name in ("HA1", "HA27"):
        return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # ---- ECO3 ----
    # Properties treated under ECO3 won't be suitable under ECO4, since their walls will be
    # filled, so ECO3 surveys are matched when the workbook has such a sheet
    eco3_list = pd.DataFrame()
    eco3_sheetnames = [
        name for name in workbook.sheetnames if "eco3" in name.lower().replace(" ", "")
    ]
    if eco3_sheetnames:
        eco3_list = drop_blank_rows(read_sheet(workbook[eco3_sheetnames[0]]))
        eco3_list["eco3_list_row_id"] = [
            ha_name + "_Eco3_" + str(position) for position in range(len(eco3_list))
        ]
    if not eco3_list.empty:
        eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name)

    if ha_name == "HA25":
        # Accommodate HA25's unique structure
        return asset_list, pd.DataFrame(), pd.DataFrame(), eco3_list

    # ---- Surveys ----
    survey_sheet = workbook[self.get_survey_sheetname(workbook)]
    survey_list = drop_blank_rows(read_sheet(survey_sheet))
    survey_list["survey_list_row_id"] = [
        ha_name + "_survey_" + str(position) for position in range(len(survey_list))
    ]
    if not survey_list.empty:
        survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)

    # ---- CIGA checks ----
    ciga_sheet = workbook[self.get_ciga_sheetname(workbook)]
    ciga_list = drop_blank_rows(read_sheet(ciga_sheet))
    if not ciga_list.empty:
        # Remove rows with missing postcode, which happens in a small number of cases
        ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
        ciga_list["ciga_list_row_id"] = [
            ha_name + "_ciga_" + str(position) for position in range(len(ciga_list))
        ]
        ciga_list = self.create_ciga_list_house_no(ciga_list)
        ciga_list = self.dedupe_ciga_list(ciga_list)
        ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)

    return asset_list, survey_list, ciga_list, eco3_list
@staticmethod
def correct_ha6_asset_list(asset_list):
    """
    Fix known HA6 street name misspellings.

    Each fix is applied in its original casing to "propertyaddress" and in lower case to
    "matching_address".

    :param asset_list: asset list with "propertyaddress" and "matching_address" columns
    :return: the corrected asset list
    """
    street_fixes = (
        ("Baggott Place", "Baggotts Place"),
        ("Cherry Tree", "Cherrytree"),
        ("Maryhill Close", "Mary Hill Close"),
        ("Moffat Way", "Moffatt Way"),
    )
    for misspelt, corrected in street_fixes:
        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace(misspelt, corrected)
        asset_list["matching_address"] = asset_list["matching_address"].str.replace(
            misspelt.lower(), corrected.lower()
        )
    return asset_list
@staticmethod
def correct_ha56_asset_list(asset_list):
    """
    Mark known HA56 properties as "Not eligible".

    All rows at the listed postcodes are excluded, plus the single block
    "Block 17-26 Tavlin Avenue" at WA5 0EN.

    :param asset_list: asset list with "Post Code", "Address 1" and "ECO Eligibility" columns
    :return: the corrected asset list
    """
    excluded_postcodes = [
        # CH1 4JR has already been surveyed, but it's listed in the asset list as a single row
        # when it's actually 32 units, so it is simply set as ineligible
        "CH1 4JR",
        # Same for CW8 3EU
        "CW8 3EU",
        "CW1 3HP",
        "WA4 2PH",
        "BD6 1QJ",
        "L39 1RS",
        "WA10 2DE",
        # Already surveyed under ECO4
        "SK17 6NR",
    ]
    postcode = asset_list["Post Code"]
    tavlin_block = (postcode == "WA5 0EN") & (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")
    asset_list["ECO Eligibility"] = np.where(
        postcode.isin(excluded_postcodes) | tavlin_block,
        "Not eligible",
        asset_list["ECO Eligibility"]
    )
    return asset_list
@staticmethod
def correct_ha14_asset_list(asset_list):
    """
    Manual correction for HA14: 5 Queens Court is listed under DE72 3NP but is actually at
    DE72 3QZ, so its matching postcode and matching address are overwritten.

    :param asset_list: asset list with "Address 1", "Postcode", "matching_postcode" and
        "matching_address" columns
    :return: the corrected asset list
    """
    is_queens_court = (
        (asset_list["Address 1"] == "5 Queens Court") &
        (asset_list["Postcode"].str.strip() == "DE72 3NP")
    )
    # matching_postcode is lower-cased everywhere else in this file, and the matching_address
    # written just below is lower case too, so the corrected postcode has to be lower case
    asset_list.loc[is_queens_court, "matching_postcode"] = "de72 3qz"
    asset_list.loc[is_queens_court, "matching_address"] = (
        "5 queens court, garfield avenue, draycott, derby, de72 3qz"
    )
    return asset_list
@staticmethod
def correct_ha15_asset_list(asset_list):
    """
    Manual correction for HA15: rows with "Address Line 1" equal to "103 Priory Crescent" get
    the matching postcode "hp19 9ny".

    :param asset_list: asset list with "Address Line 1" and "matching_postcode" columns
    :return: the corrected asset list
    """
    is_priory_crescent = asset_list["Address Line 1"] == "103 Priory Crescent"
    current_postcode = asset_list["matching_postcode"]
    asset_list["matching_postcode"] = np.where(is_priory_crescent, "hp19 9ny", current_postcode)
    return asset_list
@staticmethod
def correct_ha32_asset_list(asset_list):
    """
    Manual correction for HA32: 7 Norton Grove listed under HU4 6HQ has its "Postcode"
    replaced with "hu4 6hg".

    :param asset_list: asset list with "Street", "Postcode" and "Dwelling num" columns
    :return: the corrected asset list
    """
    # NOTE(review): only the raw "Postcode" column is changed here; matching_postcode and
    # matching_address are left as they were - confirm that is what downstream matching expects
    on_norton_grove = asset_list["Street"] == "Norton Grove"
    listed_postcode = asset_list["Postcode"] == "HU4 6HQ"
    is_number_seven = asset_list["Dwelling num"] == "7"
    asset_list["Postcode"] = np.where(
        on_norton_grove & listed_postcode & is_number_seven,
        "hu4 6hg",
        asset_list["Postcode"]
    )
    return asset_list
@staticmethod
def correct_ha38_asset_list(asset_list):
    """
    Manual corrections for HA38.

    1. Addresses of the form "<building>/flat <n>, ..." carry the flat number after the
       building, so the flat number becomes the HouseNo and the matching address is rearranged
       to start with "FLAT <n>...". The extracted number is also kept in "ExtractedHouseNo".
    2. The rooms and flats at 10 South View get hand-written house numbers and, for selected
       rows, hand-written matching addresses.

    :param asset_list: asset list with "matching_address", "HouseNo" and "Address_Line_1"
        columns
    :return: the corrected asset list
    """
    def rearrange_address_if_flat(address):
        # Locate the marker case-insensitively so the slice always agrees with the check
        marker_position = address.lower().find('/flat')
        if marker_position == -1:
            return address
        before_marker = address[:marker_position]
        after_marker = address[marker_position + len('/flat'):]
        return f"FLAT{after_marker}, {before_marker}"

    def extract_house_no_if_flat(address):
        if '/flat' not in address.lower():
            return None
        # Attempt to extract the house number following "/flat "
        try:
            house_no = address.split('/flat ')[1].split(' ')[0]
        except IndexError:
            return None
        # Remove trailing comma
        return house_no.replace(",", "")

    asset_list['ExtractedHouseNo'] = asset_list['matching_address'].apply(extract_house_no_if_flat)
    asset_list.loc[asset_list['ExtractedHouseNo'].notnull(), 'HouseNo'] = asset_list['ExtractedHouseNo']
    asset_list['matching_address'] = asset_list['matching_address'].apply(rearrange_address_if_flat)

    # Hand-written fixes for 10 South View, as tuples of:
    # (Address_Line_1 values that get the house number, house number,
    #  Address_Line_1 values that get the matching address, matching address)
    south_view_overrides = [
        (
            ["10 SOUTH VIEW/ROOM A1", "10 SOUTH VIEW/ROOM A2", "10 SOUTH VIEW/ROOM A3"],
            "10A",
            ["10 SOUTH VIEW/ROOM A1"],
            # A stray apostrophe used to be embedded at the end of this address
            "10a, 10 south view/room a1, spennymoor, co. durham, dl16 7df",
        ),
        (
            [
                "10 SOUTH VIEW/ROOM B1",
                "10 SOUTH VIEW/ROOM B2",
                "10 SOUTH VIEW/ROOM B3",
                "10 SOUTH VIEW/ROOM B4",
            ],
            "10B",
            ["10 SOUTH VIEW/ROOM B1"],
            "10b, 10 south view/room b1, spennymoor, co. durham, dl16 7df",
        ),
        (
            ["10 SOUTH VIEW/FLAT C"],
            "10C",
            ["10 SOUTH VIEW/FLAT C"],
            "FLAT c, spennymoor, co. durham, dl16 7df, 10c, 10 south view",
        ),
        (
            ["10 SOUTH VIEW/FLAT D"],
            "10D",
            ["10 SOUTH VIEW/FLAT D"],
            "FLAT d, spennymoor, co. durham, dl16 7df, 10d, 10 south view",
        ),
        (
            ["10 SOUTH VIEW/FLAT E"],
            "10E",
            ["10 SOUTH VIEW/FLAT E"],
            "FLAT e, spennymoor, co. durham, dl16 7df, 10e, 10 south view",
        ),
    ]
    for house_no_rows, house_no, address_rows, matching_address in south_view_overrides:
        asset_list["HouseNo"] = np.where(
            asset_list["Address_Line_1"].isin(house_no_rows),
            house_no,
            asset_list["HouseNo"]
        )
        asset_list["matching_address"] = np.where(
            asset_list["Address_Line_1"].isin(address_rows),
            matching_address,
            asset_list["matching_address"]
        )
    return asset_list
@staticmethod
def correct_ha6_survey_list(survey_list):
    """
    Manual corrections for the HA6 survey list.

    Fixes misspelt street names and wrong or missing postcodes, then drops a hand-picked set
    of survey rows. The steps are order dependent: several later fixes rely on the spelling
    produced by earlier ones.

    :param survey_list: survey DataFrame with "Street / Block Name", "Post Code", "NO.",
        "INSTALLED OR CANCELLED" and "SUBMISSION DATE" columns
    :return: the corrected survey list
    """
    street = "Street / Block Name"

    def fix_street_names(fixes):
        # Every fix is a literal substring replacement. regex=False has to be explicit: where
        # str.replace defaults to regex=True, the "." fix would wipe out every street name.
        for wrong, right in fixes:
            survey_list[street] = survey_list[street].str.replace(wrong, right, regex=False)

    fix_street_names([
        ("Seabridge Road", "Seabridge Lane"),
        # Strip out /KNUTTON from the street name
        ("/KNUTTON", ""),
        ("Clevend Road", "Cleveland Road"),
        ("TURNERS AVENUE", "Turner Avenue"),
        ("WEDGEWWOD AVENUE", "Wedgwood Avenue"),
    ])

    # The cherrytree record has wrong postcode
    survey_list.loc[survey_list[street] == "Cherrytree road", "Post Code"] = "ST5 7BP"

    fix_street_names([
        ("MONUMENT RD", "Monument Road"),
        # Generally replace " RD" with " Road"
        (" RD", " Road"),
        ("HILARY Road", "Hillary Road"),
        # Remove full stops from the street name
        (".", ""),
        ("Chatworth road", "Chatsworth Place"),
        ("Wood Croft", "Woodcroft"),
        ("Milstone Avenue", "Millstone Avenue"),
        # Strip out /TALKE from the street name
        ("/TALKE", ""),
        ("Woodcutts Street", "Woodshutts Street"),
        ("HILLARY AVENUE", "Hillary Road"),
        # Replace " Rd" with " Road"
        (" Rd", " Road"),
    ])

    # We have a record listed as 19, MAPLE AVENUE ST7 1JX, when it should be
    # 19, Hollins Crescent ST7 1JX
    survey_list.loc[
        (survey_list[street] == "MAPLE AVENUE") &
        (survey_list["NO."].isin([19])) &
        (survey_list["Post Code"] == "ST7 1JX"),
        street
    ] = "Hollins Crescent"
    # However, some of the maple avenue records are indeed Maple avenue, but are listed with
    # the wrong postcode. E.g. number 26
    survey_list.loc[
        (survey_list[street] == "MAPLE AVENUE") &
        (survey_list["NO."].isin([26])) &
        (survey_list["Post Code"] == "ST7 1JX"),
        "Post Code"
    ] = "ST7 1JW"

    fix_street_names([
        ("BURSLEY Road", "Bursley Way"),
        ("Brittania Avenue", "Brittain Avenue"),
        ("Hawthorn Road", "Hawthorne Road"),
        ("Eastdale Place", "Easdale Place"),
        ("Wedgewood Road", "Wedgwood Road"),
        ("Droitwich Drive", "Droitwich Close"),
        ("Longdale Road", "Langdale Road"),
    ])

    # We have 2 addresses in the survey list that don't have postcodes. We'll manually add them
    survey_list.loc[
        (survey_list[street] == "Rogers Avenue") & pd.isnull(survey_list["Post Code"]),
        "Post Code"
    ] = "ST5 9AT"
    survey_list.loc[
        (survey_list[street] == "Cedar Road") & pd.isnull(survey_list["Post Code"]),
        "Post Code"
    ] = "ST5 7BY"

    # PERFORM ADDITIONAL DROPS
    street_name = survey_list[street]
    postcode = survey_list["Post Code"]
    number = survey_list["NO."]
    # na=False: a missing status simply means "not flagged" rather than poisoning the mask
    no_update_yet = survey_list["INSTALLED OR CANCELLED"].str.contains("NO UPDATE YET", na=False)
    conditions_to_drop = [
        (street_name == "Bedford Crescent") & (postcode == "ST5 3EH") & (number == 23) & no_update_yet,
        (street_name == "Hereford Avenue") & (postcode == "ST5 3EJ") & (number == 92) & no_update_yet,
        (street_name == "Seabridge Lane") & (postcode == "ST5 3EX") & (
            number.isin([16, 18, 42])) & no_update_yet,
        (street_name == "ESKDALE PLACE") & (postcode == "ST5 3QW") & (number == 5) & (
            survey_list["SUBMISSION DATE"].astype(str) == "2023-03-06 00:00:00"),
        (street_name == "Birch House road") & (postcode == "ST6 2LS") & (number.isin([56, 58])),
        (street_name == "Blackthorn Place") & (postcode == "ST6 2LS") & (number.isin([37, 39])),
        (street_name == "Whitethorn Way") & (postcode == "ST5 7BT") & (number.isin([17, 6])),
        (street_name == "Lion Grove") & (postcode == "ST5 7HQ") & (
            number.isin([10, 12])) & no_update_yet,
        (street_name == "DENRY CRESCENT") & (postcode == "ST5 8JW") & (number == 87) & no_update_yet,
        (street_name == "HOLLINS CRESCENT") & (postcode == "ST7 1JW") & (number == 19),
    ]
    # Combine all conditions with an OR "|"
    combined_condition = np.logical_or.reduce(conditions_to_drop)
    # Drop rows that meet the combined condition; copy so the assignments below write to an
    # independent frame rather than to a slice
    survey_list = survey_list[~combined_condition].copy()

    # Making replacements using np.where
    survey_list["Post Code"] = np.where(
        (survey_list[street] == "Whitethorn Way") & (survey_list["Post Code"] == "ST5 3EH") & (
            survey_list["NO."] == 17),
        "ST5 7BT",
        survey_list["Post Code"]
    )
    survey_list["Post Code"] = np.where(
        (survey_list[street] == "Whitethorn Way") & (survey_list["Post Code"] == "ST5 3ED") & (
            survey_list["NO."] == 6),
        "ST5 7BT",
        survey_list["Post Code"]
    )
    # Maple avenue (stoke on trent, not newcastle) should be st7 1jw
    survey_list["Post Code"] = np.where(
        (survey_list[street].str.lower().str.contains("maple avenue", na=False)) & (
            survey_list["Post Code"].str.lower() == "st7 1jx"
        ),
        "st7 1jw",
        survey_list["Post Code"]
    )
    # Hollins Crescent should be st7 1jx
    survey_list["Post Code"] = np.where(
        (survey_list[street].str.lower().str.contains("hollins crescent", na=False)) & (
            survey_list["Post Code"].str.lower() == "st7 1jw"
        ),
        "st7 1jx",
        survey_list["Post Code"]
    )

    # Additional drops as the above misses some:
    survey_list = survey_list[
        ~((survey_list["NO."].astype(str).isin(["18", "42"])) &
          (survey_list[street] == "Seabridge Lane") &
          (survey_list["Post Code"] == "ST5 3EY") &
          (survey_list["SUBMISSION DATE"].astype(str) == "24.07.2023") &
          (survey_list["INSTALLED OR CANCELLED"].str.contains("NO UPDATE YET", na=False)))
    ]
    return survey_list
@staticmethod
def correct_ha14_survey_list(survey_list):
    """
    Fix known street name misspellings in the HA14 survey list.

    :param survey_list: survey DataFrame with "Street / Block Name" and "Post Code" columns
    :return: the corrected survey list
    """
    street = "Street / Block Name"

    for misspelt, corrected in (("Godfrey Road", "Godfrey Drive"), ("Oiliver Road", "Oliver Road")):
        survey_list[street] = survey_list[street].str.replace(misspelt, corrected)

    # For postcodes DE7 4FB and DE7 4EZ the street is WINDERMERE AVENUE, not
    # WINDEREMERE AVENUE (with the extra e)
    misspelt_windermere = (
        (survey_list[street] == "WINDEREMERE AVENUE") &
        (survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"]))
    )
    survey_list.loc[misspelt_windermere, street] = "WINDERMERE AVENUE"

    survey_list[street] = survey_list[street].str.replace("MACDONALD SQAURE", "MACDONALD SQUARE")
    return survey_list
@staticmethod
def correct_ha15_survey_list(survey_list):
    """
    HA15 survey correction: remove the town from "Mary Mac Manus Drive, Milton Keynes" so that
    only the street name is left.

    :param survey_list: survey DataFrame with a "Street / Block Name" column
    :return: the corrected survey list
    """
    street_names = survey_list["Street / Block Name"]
    without_town = street_names.str.replace(
        "Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive"
    )
    survey_list["Street / Block Name"] = without_town
    return survey_list
@staticmethod
def correct_ha16_survey_list(survey_list):
    """
    Manual corrections for the HA16 survey list.

    Street names are normalised (slashes to commas, lower case, "rd" to "road", tidy comma
    spacing, stripped), two postcodes are corrected, a list of street name misspellings is
    fixed, the house number "66-68" becomes "66" and two duplicated entries are dropped.

    :param survey_list: survey DataFrame with "Street / Block Name", "Post Code", "NO." and
        "SUBMISSION DATE" columns
    :return: the corrected survey list
    """
    street = "Street / Block Name"

    survey_list[street] = survey_list[street].str.replace("/", ", ", regex=False)
    survey_list[street] = survey_list[street].str.lower()
    # An upper-case "REEDS RD" comparison used to sit here; it could never match after the
    # lower-casing above, and the whole-word rule below already turns "reeds rd" into
    # "reeds road".
    # Replace the whole word "rd" with "road"
    survey_list[street] = survey_list[street].str.replace(r'\brd\b', 'road', regex=True)
    # Replace " , " with ", "
    survey_list[street] = survey_list[street].str.replace(" , ", ', ', regex=False)
    # Fix "{place} ,{place}" with "{place}, {place}"
    survey_list[street] = survey_list[street].str.replace(r'\s*,\s*', ', ', regex=True)
    # Strip whitespace
    survey_list[street] = survey_list[street].str.strip()

    # Correct errors
    survey_list["Post Code"] = np.where(
        survey_list["Post Code"] == "M38 0SA",
        "M38 9SA",
        survey_list["Post Code"]
    )
    survey_list["Post Code"] = np.where(
        (survey_list[street] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"),
        "M44 5JF",
        survey_list["Post Code"]
    )

    # Literal substring fixes, applied in this order.
    # NOTE(review): a few replacements introduce capitals ("Vaughan", "cartleach Grove",
    # "arbor Grove") into otherwise lower-cased names - confirm matching is case-insensitive.
    street_fixes = [
        ("eccels", "eccles"),
        ("chatley, road", "chatley road"),
        ("vaughen", "Vaughan"),
        ("cresent", "crescent"),
        ("plantation road", "plantation avenue"),
        ("how clough drive", "howclough drive"),
        ("brockhurst lane", "brookhurst lane"),
        ("biirch road", "birch road"),
        ("hadson road", "hodson road"),
        ("harbonne avennue", "narbonne avenue"),
        ("cumberland road, cadishead", "cumberland avenue, cadishead"),
        ("aston field drive", "ashton field drive"),
        ("wedgewood road", "wedgwood road"),
        ("hamilton close", "hamilton avenue"),
        ("lichens crescent, fitton hill", "lichens crescent"),
        ("south croft, fitton hill", "south croft"),
        (", fitton hill", ""),
        ("firtree dr", "fir tree avenue"),
        ("hawthorne road", "hawthorn crescent"),
        ("rein lee avenue", "reins lee avenue"),
        ("westerhill road", "wester hill road"),
        ("st martins road", "saint martins road"),
        ("timperley avenue", "timperley close"),
        ("eastwood road", "eastwood avenue"),
        ("new road", "new street"),
        ("grassmere road", "grasmere road"),
        ("hulton road", "hulton avenue"),
        ("beechfield avenue", "beechfield road"),
        ("princess avenue", "princes avenue"),
        ("edge ford crecent", "edge fold crescent"),
        ("conniston avenue", "coniston avenue"),
        ("blackthorne crescent", "blackthorn crescent"),
        ("wellstock road", "wellstock lane"),
        ("brackley avenue", "brackley street"),
        ("brook avenue swinton", "brook avenue, swinton"),
        ("green avenue swinton", "green avenue, swinton"),
        ("grasmere avenue wardley", "grasmere avenue, wardley"),
        ("mardale avenue wardle", "mardale avenue, wardle"),
        ("carleach grove", "cartleach Grove"),
        ("arbour grove", "arbor Grove"),
    ]
    for wrong, right in street_fixes:
        survey_list[street] = survey_list[street].str.replace(wrong, right, regex=False)

    # Replacement for clively avenue 66-68
    survey_list["NO."] = np.where(
        survey_list["NO."] == "66-68",
        "66",
        survey_list["NO."]
    )

    # Delete some duplicated entries, identified by street, number, postcode and
    # submission date
    duplicated_entries = [
        ("york road", "12", "M44 5HU", "45229"),
        ("peatfield avenue", "23", "M27 9XG", "45236"),
    ]
    for dup_street, dup_number, dup_postcode, dup_submitted in duplicated_entries:
        survey_list = survey_list[
            ~((survey_list[street] == dup_street) &
              (survey_list["NO."].astype(str) == dup_number) &
              (survey_list["Post Code"] == dup_postcode) &
              (survey_list["SUBMISSION DATE"].astype(str) == dup_submitted))
        ]
    return survey_list
@staticmethod
def correct_ha24_survey_list(survey_list):
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ")
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower()
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.strip()
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"council house, nidds lane", "nidds lane"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"wirral avenue", "wirrall avenue"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"st ives road", "st. ives crescent"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"sundringham road", "sandringham road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"milton avenue", "milton road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"st ives crescent", "st. ives crescent"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"council house, waterbelly lane", "waterbelly lane"
)
# Generally remove "councile house, " from the start of the street name
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"council house, ", ""
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"st. leodegars close", "st leodegars close"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"montgomery crescent", "montgomery road"
)
return survey_list
@staticmethod
def correct_ha28_survey_list(survey_list):
# Rename the "No" column to "No." to align with the other survey sheets
survey_list = survey_list.rename(columns={"NO ": "NO."})
survey_list["Post Code"] = np.where(
survey_list["Post Code"] == "ME75HA",
"ME7 5HA",
survey_list["Post Code"]
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ANDREW MANOR/BRITTON ST", "ANDREW MANOR"
)
survey_list["Post Code"] = np.where(
survey_list["Post Code"] == "ME75TW",
"ME7 5TW",
survey_list["Post Code"]
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ST MARKS HOUSE/SAXON ST", "ST MARKS HOUSE"
)
return survey_list
@staticmethod
def correct_ha38_survey_list(survey_list):
# Rename the "No" column to "No." to align with the other survey sheets
survey_list = survey_list.rename(columns={"NO ": "NO."})
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
'Kingsford Court, Coombe Valley Road', 'Kingsford Court'
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
'LESLIE TEW COURT/DERWENT ROAD', 'LESLIE TEW COURT'
)
# There is no 18A LESLIE TEW COURT in the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "LESLIE TEW COURT") &
(survey_list["Post Code"] == "TN10 3TX") &
(survey_list["NO."] == "18A"))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
'Brindley House, Wellbeck Road', 'Brindley House'
)
# Try taking just the first part of the string, splitting on a /
survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.split('/').str[0].str.strip()
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
'HUNTSMAN WAY', 'HUNTSMANS WAY'
)
# Try taking just the first part of the string, splitting on a ,
survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.split(',').str[0].str.strip()
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"McCLAREN COURT", "MCLAREN COURT"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ST JAMES CLOISTERS", "ST. JAMES'S CLOISTERS"
)
survey_list["Street / Block Name"] = np.where(
((survey_list["NO."].isin(
[
"FLAT 1 22",
"FLAT 2 22",
"FLAT 3 22",
"FLAT 4 22",
"FLAT 5 22",
"FLAT 6 22",
]
)) &
(survey_list["Street / Block Name"] == "MELTON ROAD")),
"22 MELTON ROAD",
survey_list["Street / Block Name"]
)
survey_list["Street / Block Name"] = np.where(
((survey_list["NO."].isin(
[
"FLAT 1 24",
"FLAT 2 24",
"FLAT 3 24",
"FLAT 4 24",
"FLAT 5 24",
"FLAT 6 24",
]
)) &
(survey_list["Street / Block Name"] == "MELTON ROAD")),
"24 MELTON ROAD",
survey_list["Street / Block Name"]
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"TURRETT GREEN COURT SILENT STREET", "TURRET GREEN COURT"
)
# Turret green court flat 1 doesn't exist in the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "TURRET GREEN COURT") &
(survey_list["NO."] == 1))
]
# 3, 45 raywell steet doesn't exist in the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "45 RAYWELL STREET") &
(survey_list["NO."] == 3))
]
# 40 Avondale drive doesn't exist in the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Avondale Drive") &
(survey_list["NO."] == 40))
]
# 17A beech road has the wrong postcode
survey_list["Post Code"] = np.where(
(survey_list["Street / Block Name"] == "BEECH ROAD") &
(survey_list["Post Code"] == "DH6 1JD"),
"DH6 1JB",
survey_list["Post Code"]
)
survey_list["Street / Block Name"] = np.where(
(survey_list["Street / Block Name"] == "SOUTHVIEW") &
(survey_list["Post Code"] == "DL16 7DF"),
"SOUTH VIEW",
survey_list["Street / Block Name"]
)
survey_list["Post Code"] = np.where(
(survey_list["Street / Block Name"] == "BEECH ROAD") &
(survey_list["Post Code"] == "DH6 1JD"),
"DH6 1JB",
survey_list["Post Code"]
)
return survey_list
@staticmethod
def correct_ha32_survey_list(survey_list):
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == "Coxwold",
"Coxwold Grove",
survey_list["Street / Block Name"]
)
# Update the Barringhton Avenue with their correct spelling: Barrington Avenue
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == "Barringhton Avenue",
"Barrington Avenue",
survey_list["Street / Block Name"]
)
# Update how the Rustenburn addresses are listed in the identified addresses
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == "Rustenburg",
"Rustenburg Street",
survey_list["Street / Block Name"]
)
# Update how the MALIN LODGE, RONALDSWAY CLOSE addresses are listed in the identified addresses
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == "MALIN LODGE, RONALDSWAY CLOSE",
"Malin Lodge",
survey_list["Street / Block Name"]
)
# Update how the Feroes Close are listed in the identified addresses
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == "Feroes Close",
"Faroes Close",
survey_list["Street / Block Name"]
)
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == 'FORESTER WAY',
'FORESTER WAY',
survey_list["Street / Block Name"]
)
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == '6 Zeigfeld',
'Ziegfeld Court',
survey_list["Street / Block Name"]
)
# Malin Lodge, Ronaldsway Close
survey_list["Street / Block Name"] = np.where(
survey_list["Street / Block Name"] == 'Malin Lodge, Ronaldsway Close',
'Malin Lodge',
survey_list["Street / Block Name"]
)
return survey_list
@staticmethod
def correct_ha50_survey_list(survey_list):
survey_list["Post Code"] = np.where(
(survey_list["Street / Block Name"] == 'COSELEY STREET') &
(survey_list["Post Code"] == 'ST16 1LR'),
"ST6 1JU",
survey_list["Post Code"]
)
# Remove some of COSELEY STREET, as we have surveys done, outside of the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "COSELEY STREET") &
(survey_list["Post Code"] == "ST6 1JU") &
(survey_list["NO."].isin([96])))
]
survey_list["Post Code"] = survey_list["Post Code"].str.replace("ST33JZ", "ST3 3JZ")
# Remove some of Jesmond drive as we have surveys done outside of the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Jesmond Drive") &
(survey_list["Post Code"] == "ST3 3JZ") &
(survey_list["NO."].isin([29])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"BRUNDELL OVAL", "BRUNDALL OVAL"
)
# Remove 4 Linden Place
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Linden Place") &
(survey_list["Post Code"] == "ST3 3AT") &
(survey_list["NO."].isin([4])))
]
# Remove 11 Tilehurst Place
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Tilehurst Place") &
(survey_list["Post Code"] == "ST3 3AP") &
(survey_list["NO."].isin([11])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"deavile road", "DEAVILLE ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"WOOLISCROFT ROAD", "WOOLLISCROFT ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Leak Road", "Leek Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Springfield road", "Springfields road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"MILLWARD RD", "MILLWARD ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"REPINGTON RD", "REPINGTON ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ECCELSTONE PLACE", "ECCLESTONE PLACE"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"St. James Place", "St James Place"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"CHELL HEATH RD", "CHELL HEATH ROAD"
)
# Correct postcode
survey_list["Post Code"] = np.where(
(survey_list["Street / Block Name"] == 'CHELL HEATH ROAD') &
(survey_list["Post Code"] == 'ST6 6HU'),
"ST6 6HJ",
survey_list["Post Code"]
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Franklin Rd", "Franklin Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Lodge Rd", "Lodge Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"St Matthews Street", "St Matthew Street"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Grove Bank Road", "Grovebank Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"OVERSLEY RD", "OVERSLEY ROAD"
)
# Replace all of the " RD" with " ROAD"
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
" RD", " ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"St. Georges Crescent", "St Georges Crescent"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Tewson Road", "Tewson Green"
)
# Remove 55 Seabridge Lane
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Seabridge Lane") &
(survey_list["Post Code"] == "ST5 4AG") &
(survey_list["NO."].isin([55])))
]
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Tyne Way") &
(survey_list["Post Code"] == "ST5 4AX") &
(survey_list["NO."].isin([56])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"St.Bernards Place", "St Bernard Place"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Penarth Road", "Penarth Grove"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"St. Marys Road", "St Marys Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Larch Drive", "Larch Grove"
)
# Drop 31 Lauder place north, as there is a duplicate. THis version also has a wrong postcode
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "LAUDER PLACE NORTH") &
(survey_list["Post Code"] == "ST20QS") &
(survey_list["NO."].isin([31])))
]
# Handle dropping of dupes
survey_list["street_pruner"] = survey_list["Street / Block Name"].str.lower().str.replace(" ", "")
survey_list["postcode_pruner"] = survey_list["Post Code"].str.lower().str.replace(" ", "")
# Should go to 18
survey_list = survey_list.drop_duplicates(["NO.", "street_pruner", "postcode_pruner"])
survey_list = survey_list.drop(columns=["street_pruner", "postcode_pruner"])
return survey_list
@staticmethod
def correct_ha107_survey_list(survey_list):
# Replace Front Street, East Stockham with Front Street, East Stockwith
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Front Street, East Stockham", "Front Street, East Stockwith"
)
# Replace "HONEYHOLE L;ANE" with "HONEYHOLES LANE"
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"HONEYHOLE L;ANE", "HONEYHOLES LANE"
)
# Replace "Croft Lane Cherry Willingham, Lincoln" with "Croft Lane, Cherry Willingham, Lincoln"
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Croft Lane Cherry Willingham, Lincoln", "Croft Lane, Cherry Willingham, Lincoln"
)
# Replace "Snelland Road Wickenby, Lincoln" with "Snelland Road, Wickenby, Lincoln"
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"
)
# Replace Reasby Road Snelland, Lincoln with Reasby Road, Snelland, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"
)
# Replace Silver Street Bardney, Lincoln with Silver Street, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"
)
# Replace Manor Close Bardney, Lincoln with Manor Close, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"
)
# Replace Ferry Road Southrey, Lincoln with Ferry Road, Southrey, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"
)
# Replace Harvey Kent Gardens Bardney, Lincoln with Harvey Kent Gardens, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"
)
# Replace Wragby Road Bardney, Lincoln with Wragby Road, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"
)
# Replace SPRINKHILL ROAD with SPINKHILL ROAD
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"SPRINKHILL ROAD", "SPINKHILL ROAD"
)
return survey_list
@staticmethod
def correct_ha41_survey_list(survey_list):
    """Return the HA41 survey list unchanged; no manual corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha12_survey_list(survey_list):
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Henstone Road", "Hanstone Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Lindern avenue", "Linden Avenue"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"priness way", "Princess Way"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Worth Crecesent", "Worth Crescent"
)
survey_list["Post Code"] = survey_list["Post Code"].str.replace(
"DY117HA", "DY11 7HA"
)
survey_list["Post Code"] = survey_list["Post Code"].str.replace(
"DY117HF", "DY11 7HF"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Adderbrook Crescent", "Addenbrooke Crescent"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Kinver Road", "Kinver Avenue"
)
return survey_list
@staticmethod
def correct_ha13_survey_list(survey_list):
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Woodfarm Road", "WOOD FARM ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ALLANDALE ROAD", "ALLANDALE"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"NEWFIELDS LANE", "NEWFIELD LANE"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"BROADFIELDS ROAD", "BROADFIELD ROAD"
)
survey_list["Post Code"] = survey_list["Post Code"].str.replace(
"HP2 5SF+", "HP2 5SF"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"PESCOTT HILL", "PESCOT HILL"
)
# This is a duplicate record
survey_list = survey_list[
~((survey_list["NO."] == 33) &
(survey_list["Street / Block Name"] == "Turners Hill") &
(survey_list["Post Code"] == "HP2 4LH") &
(survey_list["INSTALLED OR CANCELLED"] == "NO UPDATE - CHECKED 18.12.23"))
]
return survey_list
@staticmethod
def correct_ha18_survey_list(survey_list):
    """Return the HA18 survey list unchanged; no manual corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha35_survey_list(survey_list):
    """Return the HA35 survey list unchanged; no manual corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha34_survey_list(survey_list):
# Note in the asset list
survey_list = survey_list[
survey_list["Post Code"] != "L5 3SS"
]
survey_list["Post Code"] = survey_list["Post Code"].str.replace(
"L177DR", "L17 7DR"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"PENVALLEY CRESENT", "Penvalley Crescent"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"PENLINKEN DRIVE", "Penlinken Drive"
)
# There's no 32 Penlinken Drive in the asset sheet
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Penlinken Drive") &
(survey_list["NO."] == 32))
]
# There's no 30 Gwent Street in the asset sheet
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "GWENT ST") &
(survey_list["NO."] == 30))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"POULTON RD", "Poulton Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ST PAULS RD", "St Pauls Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"BROAD LANE, KIRKBY", "BROAD LANE"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"BULLENS RD, KIRKBY", "Bullens Road"
)
# There's no 219 NORTH HILL ST in the asset sheet
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "NORTH HILL ST") &
(survey_list["NO."] == 219))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"CROSLAND RD, KIRKBY", "CROSLAND ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"PARK BROW DRIVE, KIRKBY", "Park Brow Drive"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"CELTIC TREET", "Celtic Street"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"BUCKLAND ROAD", "Buckland Street"
)
# duplicates
survey_list = survey_list.drop_duplicates(["Street / Block Name", "NO.", "Post Code"])
# This is a duplicate with wrong postcode
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "CLARIBEL STREET") &
(survey_list["NO."] == 7) &
(survey_list["Post Code"] == "L8 8AF"))
]
survey_list["NO."] = np.where(
((survey_list["NO."] == "187 A") &
(survey_list["Post Code"] == "L32 6QF")),
"187A",
survey_list["NO."]
)
return survey_list
@staticmethod
def correct_ha56_survey_list(survey_list):
# Not in asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Samual Street") &
(survey_list["NO."].isin([22, 24])) &
(survey_list["Post Code"] == "WA5 1BB"))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"STOURTON RD", "Stourton Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"BIRKIN RD", "Birkin Road"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"PORTLAND RD", "Portland Road"
)
# We remove a row, because two rows match to a block listing
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Tavlin Avenue") &
(survey_list["NO."] == 17) &
(survey_list["Post Code"] == "WA5 0EN"))
]
return survey_list
@staticmethod
def correct_ha30_survey_list(survey_list):
survey_list = survey_list[~pd.isnull(survey_list["Post Code"])]
# Split on / and take the first half
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.split("/").str[0]
# Not in the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Horsebridge Road") &
(survey_list["NO."] == 286))
]
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "DUTTON WAY") &
(survey_list["NO."] == 9))
]
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "PAYTHORNE CLOSE") &
(survey_list["NO."] == 10))
]
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "MARCHWOOD ROAD") &
(survey_list["NO."] == 11))
]
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Otterburn Close") &
(survey_list["NO."] == 4))
]
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Blossom Court") &
(survey_list["NO."] == 5))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"St LUKES CLOSE , HUNTINGDON", "St. Lukes Close"
)
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "St. Lukes Close") &
(survey_list["NO."].isin([4, 7, 8])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ROMAN WAY , GODMANCHESTER , HUNTINGDON", "Roman Way"
)
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Roman Way") &
(survey_list["NO."].isin([58])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"HEADLANDS , FENSTANTON , HUNTINGDON", "Headlands Fenstanton"
)
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Headlands Fenstanton") &
(survey_list["NO."].isin([126, 134])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"WALLACE COURT , HUNTINGDON", "Wallace Court"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"CRICKETERS WAY , CHATTERIS", "Cricketers Way"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Jubilee Gardens", "Jubilee Green"
)
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "Harrow Road") &
(survey_list["NO."].isin([10])))
]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ST LUKES CLOSE", "St. Lukes Close"
)
return survey_list
@staticmethod
def correct_ha49_survey_list(survey_list):
    """Return the HA49 survey list unchanged; no manual corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha8_survey_list(survey_list):
# Split on / and take the first half
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.split("/").str[0]
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"WESTONIA COURT HOUSE", "Westonia Court"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Hillesdon Avenue", "Hillesden Avenue"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Weston Street", "Western Street"
)
# Remove placeholder rows where postcode is missing
survey_list = survey_list[
~pd.isnull(survey_list["Post Code"])
]
return survey_list
@staticmethod
def correct_ha11_survey_list(survey_list):
# Remove 39 HOLLYWOOD WAY as it's not in the asset list
survey_list = survey_list[
~((survey_list["Street / Block Name"] == "HOLLYWOOD WAY") &
(survey_list["NO."] == 39))
]
return survey_list
@staticmethod
def correct_ha42_survey_list(survey_list):
# original asset list has nothing in the street
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Turnstone Terrace", ""
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Pegasus place", ""
)
return survey_list
@staticmethod
def correct_ha45_survey_list(survey_list):
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Norwich Road", "Norwich Avenue"
)
return survey_list
@staticmethod
def correct_ha51_survey_list(survey_list):
survey_list = survey_list.rename(columns={"NO ": "NO."})
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Autum Close", "Autumn Close"
)
return survey_list
@staticmethod
def correct_ha52_survey_list(survey_list):
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Mardalle Avenue", "Mardale Avenue"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Ollerton Close, Grappenhall", "Ollerton Close"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Bradshaw Road, Grappenhall", "Bradshaw Lane"
)
# Drop a bunch of dupes
survey_list = survey_list.drop_duplicates(["NO.", "Street / Block Name", "Post Code"])
return survey_list
@staticmethod
def correct_ha5_survey_list(survey_list):
    """Return the HA5 survey list unchanged; no manual corrections are defined for it."""
    return survey_list
@staticmethod
def correct_ha20_survey_list(survey_list):
# Not in the asset list
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Abbot Close", "ABBOTS CLOSE"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Downbarns Road", "DOWN BARNS ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Austin Lane", "AUSTINS LANE"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"South Park Way", "SOUTHPARK WAY"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"OAKLAND ROAD", "OAKWOOD ROAD"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"ACRE WAY/NORTHWOOD", "ACRE WAY"
)
return survey_list
@staticmethod
def levenstein_match(matching_string, df):
match_to = df["matching_address"].tolist()
# Strip out punctuation and spaces
match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
match_to = [x.replace(" ", "") for x in match_to]
# Perform matching between full key and match_to
distances = [100 - fuzz.ratio(matching_string, s) for s in match_to]
best_match_index = distances.index(min(distances))
# We might want to consider a threshold for the distance, however for the momeny,
# we don't consider this for the moment
df = df.iloc[best_match_index:best_match_index + 1]
return df
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
    """Match every survey row to an asset row and attach the asset's ``asset_list_row_id``.

    The survey list is first passed through the HA-specific ``correct_<ha>_survey_list``
    method. Each survey row is then narrowed down against the asset list in stages:

    1. assets whose "matching_address" contains the survey street name;
    2. of those, assets whose "matching_address" contains the house number (if nothing
       contains it, the number is simplified by dropping a leading "flat" and anything after
       the first "," or space);
    3. if more than one candidate remains: exact match on "HouseNo", then on postcode, and
       finally the closest fuzzy match on number + street (+ town) + postcode.

    All address fragments are matched as literal text, not as regular expressions.

    :param asset_list: DataFrame with "matching_address", "matching_postcode", "HouseNo" and
        "asset_list_row_id" columns.
    :param survey_list: DataFrame with "Street / Block Name", "Post Code" and
        "survey_list_row_id" columns, plus "NO." (possibly only after the HA-specific
        correction has renamed it) and optionally "Town/Area".
    :param ha_name: Housing association code, e.g. "HA34"; selects the correction method and
        the postcodes for which an unmatched row is tolerated.
    :return: The corrected survey list with an "asset_list_row_id" column merged on; rows that
        were tolerated as unmatched have a missing value there.
    :raises AttributeError: if no correction method exists for ``ha_name``.
    :raises ValueError: if a row cannot be matched to exactly one asset (and its postcode is
        not tolerated), if the lookup and survey row counts differ, or if two survey rows
        match the same asset.
    """
    street_col, postcode_col, number_col = "Street / Block Name", "Post Code", "NO."

    def no_unique_match(row, house_number):
        # Carries the details needed to investigate the row by hand
        return ValueError(
            "Investigate: no unique asset match for survey row "
            f"(street={row[street_col]!r}, house number={house_number!r}, "
            f"postcode={row[postcode_col]!r})"
        )

    def unmatched(row):
        return {"survey_list_row_id": row["survey_list_row_id"], "asset_list_row_id": None}

    # Correct the survey list
    survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list")
    survey_list = survey_list_correction_function(survey_list)

    # Postcodes for which a survey row is allowed to stay unmatched
    missed_postcodes = set()
    if ha_name in ["HA6", "HA34"]:
        # Built once, so each membership test is a hash lookup rather than an array scan
        asset_postcodes = set(asset_list["matching_postcode"])
        missed_postcodes = {
            postcode.lower() for postcode in survey_list[postcode_col]
            if postcode.lower() not in asset_postcodes
        }
    if ha_name == "HA13":
        missed_postcodes = {"hp17 8le"}
    if ha_name == "HA56":
        # Multiple properties are listed as blocks, which is a problem for matching
        missed_postcodes = {"sk17 6nr", "wa5 0en"}

    matching_lookup = []
    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
        street = row[street_col].lower().strip()
        house_number = row[number_col]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        # Filter on the first line of the address
        df = asset_list[asset_list["matching_address"].str.contains(street, regex=False)].copy()

        if not df["matching_address"].str.contains(str(house_number), regex=False).any():
            if "flat" in str(house_number):
                house_number = house_number.split("flat")[1].strip()
            # We check if we had an instance of flat x, y
            if "," in str(house_number):
                house_number = house_number.split(",")[0].strip()
            # We may also have a space for an instance of flat x y
            if " " in str(house_number):
                house_number = house_number.split(" ")[0].strip()

        df = df[df["matching_address"].str.contains(str(house_number), regex=False)]

        if df.empty:
            if row[postcode_col].lower() in missed_postcodes:
                matching_lookup.append(unmatched(row))
                continue
            raise no_unique_match(row, house_number)

        if df.shape[0] != 1:
            df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
            if df.shape[0] != 1:
                df = df[
                    df["matching_postcode"].str.lower().str.contains(
                        row[postcode_col].lower().strip(), regex=False
                    )
                ]

            if df.empty:
                if row[postcode_col].lower() in missed_postcodes:
                    matching_lookup.append(unmatched(row))
                    continue
                # Previously this fell through to the fuzzy matcher with no candidates,
                # which failed with an unhelpful "min() arg is an empty sequence"
                raise no_unique_match(row, house_number)

            if df.shape[0] != 1:
                key_parts = [str(row[number_col]).lower().strip(), street]
                if "Town/Area" in row.keys():
                    key_parts.append(row["Town/Area"].lower().strip())
                key_parts.append(row[postcode_col].lower().strip())
                # Remove any spaces from the full key
                full_key = "".join(key_parts).replace(" ", "")
                df = self.levenstein_match(full_key, df)

        if df.shape[0] != 1:
            raise no_unique_match(row, house_number)

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    if matching_lookup.shape[0] != survey_list.shape[0]:
        raise ValueError("Mismatch in the number of survey rows and matching lookup rows")

    # Tolerated unmatched rows carry no asset id and must not count as duplicates
    matching_lookup = matching_lookup[~pd.isnull(matching_lookup["asset_list_row_id"])]

    if matching_lookup["asset_list_row_id"].duplicated().sum():
        raise ValueError("Duplicated matches in survey list")

    # Merge onto the survey list
    survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")
    return survey_list
@staticmethod
def correct_ha25_eco3_list(eco3_list):
# NEADS DRIVE, postcode with bs305dt, is not found in the asset list
eco3_list = eco3_list[
~(eco3_list["Post Code"] == "BS305DT")
]
# Drop rows with missings postcode
eco3_list = eco3_list[
~pd.isnull(eco3_list["Post Code"])
]
# We have a bunch of genuine duplicates
eco3_list = eco3_list.drop_duplicates(["NO ", "Street / Block Name", "Post Code"])
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
"HALWILL MEADOOW", "HALWILL MEADOW"
)
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
"Hall Road", "Hall Rd"
)
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
"SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY"
)
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
"BOND SPEAR COURT", "BOND-SPEAR COURT"
)
eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
"ST.MARYS HILL", "ST MARYS HILL"
)
# Correct the postcode for edmund road
eco3_list["Post Code"] = np.where(
(eco3_list["Street / Block Name"] == "EDMUND ROAD") &
(eco3_list["Post Code"] == "TR14 8QJ"),
"TR15 1BY",
eco3_list["Post Code"]
)
return eco3_list
@staticmethod
def correct_ha50_eco3_list(eco3_list):
    """Return the HA50 ECO3 list unchanged; no manual corrections are defined for it."""
    return eco3_list
@staticmethod
def correct_ha41_eco3_list(eco3_list):
    """
    HA41 needs no ECO3 list corrections; this hook exists so the per-HA lookup by name succeeds.

    :param eco3_list: The ECO3 list for HA41.
    :return: The very same object that was passed in, untouched.
    """
    return eco3_list
@staticmethod
def correct_ha63_eco3_list(eco3_list):
    """
    Apply the HA63-specific corrections to the ECO3 list so that its addresses line up with the asset list.

    Rows without a postcode, or with a postcode known to be absent from the asset list, are dropped; street
    names are rewritten to the asset list spelling; and spaces are removed from Denmark Street house numbers.

    :param eco3_list: DataFrame with at least the "NO ", "Street / Block Name" and "Post Code" columns.
    :return: A corrected copy of the list; the DataFrame passed in is not modified.
    """
    street_col = "Street / Block Name"
    number_col = "NO "
    postcode_col = "Post Code"
    # Literal replacements: the asset list spelling is on the right
    street_corrections = {
        "POUND COTTAGES - BLOOMSBERRY CLOSE": "POUND COTTAGES",
        "FREDRICK ROAD": "Frederick Road",
        "OLD HOSPITAL MEWS HOSPITAL WALK": "Old Hospital Mews",
        "Portland House, Portland Street": "Portland House",
        "MIDDLE MARKET STREET": "Middle Market Road",
    }

    postcodes = eco3_list[postcode_col]
    # Drop missing postcodes, plus some postcodes that aren't in the asset list. Copy so the assignments
    # below never write into a slice of the caller's frame.
    eco3_list = eco3_list[postcodes.notna() & ~postcodes.isin(["NR32 15X", "NR30 2BT"])].copy()

    for wrong, right in street_corrections.items():
        # regex=False keeps the match literal regardless of the pandas version's default
        eco3_list[street_col] = eco3_list[street_col].str.replace(wrong, right, regex=False)

    # For denmark street, remove the space from the house number. Only string values are touched, so a
    # numeric house number is kept as it is instead of being turned into NaN by the .str accessor.
    house_numbers = eco3_list[number_col]
    without_spaces = house_numbers.map(
        lambda number: number.replace(" ", "") if isinstance(number, str) else number
    )
    eco3_list[number_col] = np.where(
        eco3_list[street_col] == "DENMARK STREET", without_spaces, house_numbers
    )
    return eco3_list
@staticmethod
def correct_ha117_eco3_list(eco3_list):
    """
    Apply the HA117-specific corrections to the ECO3 list so that its addresses line up with the asset list.

    :param eco3_list: DataFrame with at least the "Street / Block Name" and "Post Code" columns.
    :return: A corrected copy of the list; the DataFrame passed in is not modified.
    """
    street_col = "Street / Block Name"
    # Delete rows where postcode is null - there are some placeholder rows where this happens.
    # Copy so the assignment below never writes into a slice of the caller's frame.
    eco3_list = eco3_list[eco3_list["Post Code"].notna()].copy()
    # NOTE(review): this is a substring replacement, so a street that already reads "155 TARRING ROAD"
    # would gain a second "155" - confirm no such rows exist in the source data.
    eco3_list[street_col] = eco3_list[street_col].str.replace(
        "TARRING ROAD", "155 TARRING ROAD", regex=False
    )
    return eco3_list
@staticmethod
def correct_ha56_eco3_list(eco3_list):
    """
    Apply the HA56-specific corrections to the ECO3 list so that its addresses line up with the asset list.

    Rows without a postcode are dropped, two street names and one postcode are rewritten, and the cancelled
    entry of a duplicated address (5 Mount Pleasant, CW1 3JF) is removed.

    :param eco3_list: DataFrame with at least the "NO ", "Street / Block Name", "Post Code" and
        "INSTALL/ CANCELLATION DATE" columns.
    :return: A corrected copy of the list; the DataFrame passed in is not modified.
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"
    # Copy so the assignments below never write into a slice of the caller's frame
    eco3_list = eco3_list[eco3_list[postcode_col].notna()].copy()

    street_corrections = {
        "Mount Pleasant, Crewe": "Mount Pleasant",
        "Dutton Close": "Dutton Way",
    }
    for wrong, right in street_corrections.items():
        # regex=False keeps the match literal regardless of the pandas version's default
        eco3_list[street_col] = eco3_list[street_col].str.replace(wrong, right, regex=False)
    eco3_list[postcode_col] = eco3_list[postcode_col].str.replace("Ls63nl", "LS6 3NL", regex=False)

    # Handle a duplicate: the same address appears twice and the cancelled record is the one to discard
    cancelled_duplicate = (
        (eco3_list[street_col] == "Mount Pleasant")
        & (eco3_list[postcode_col] == "CW1 3JF")
        & (eco3_list["NO "] == 5)
        & (eco3_list["INSTALL/ CANCELLATION DATE"] == "CANCELLED 20.5.2022")
    )
    return eco3_list[~cancelled_duplicate].copy()
@staticmethod
def correct_ha51_eco3_list(eco3_list):
    """
    Apply the HA51-specific corrections to the ECO3 list so that its addresses line up with the asset list.

    Street-name misspellings and one postcode are corrected, one address that is not in the asset list is
    dropped and the house number "47 B" is rewritten as "47B".

    :param eco3_list: DataFrame with at least the "NO ", "Street / Block Name" and "Post Code" columns.
    :return: A corrected copy of the list; the DataFrame passed in is not modified.
    """
    street_col = "Street / Block Name"
    number_col = "NO "
    postcode_col = "Post Code"
    street_corrections = {
        "HASELEMERE AVENUE": "HASLEMERE AVENUE",
        "THORVILLE GROVE": "THORNVILLE GROVE",
        "MONTBRETA CLOSE": "MONTBRETIA CLOSE",
    }

    # Work on a copy: assigning straight onto the argument would leave the caller's frame half-corrected
    eco3_list = eco3_list.copy()
    for wrong, right in street_corrections.items():
        # regex=False keeps the match literal regardless of the pandas version's default
        eco3_list[street_col] = eco3_list[street_col].str.replace(wrong, right, regex=False)

    wrong_sydenham_postcode = (
        (eco3_list[street_col] == "SYDENHAM ROAD") & (eco3_list[postcode_col] == "CR0 2DW")
    )
    eco3_list.loc[wrong_sydenham_postcode, postcode_col] = "CR0 2ED"

    # Not in asset list
    not_in_asset_list = (
        (eco3_list[street_col] == "WOODLEY LANE")
        & (eco3_list[postcode_col] == "SM5 2RJ")
        & (eco3_list[number_col] == "FLAT 3, 11")
    )
    eco3_list = eco3_list[~not_in_asset_list].copy()

    eco3_list.loc[eco3_list[number_col] == "47 B", number_col] = "47B"
    return eco3_list
def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
# Match every ECO3 row to a single asset-list row and return the ECO3 list with the matched
# "asset_list_row_id" merged on (left merge on "eco3_list_row_id").
# The HA-specific corrections are applied first, through the correct_<ha_name>_eco3_list method that is
# looked up by name on self.
# Raises ValueError when a row that still has candidates does not end up with exactly one, when the number
# of missed rows differs from self.UNMATCHED_ECO3[ha_name], or when two ECO3 rows map to the same asset.
# Side effect: "matching_postcode_nospace" is added to the asset_list passed in and is not removed again;
# only the temporary "matching_address_no_punctuation" column is dropped at the end.
eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list")
eco3_list = eco3_list_correction_function(eco3_list)
# Normalise both postcodes the same way (lower case, no spaces) so they can be compared
asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower()
eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "")
if ha_name in ["HA25", "HA56", "HA51"]:
# Drop the ECO3 rows whose postcode does not appear anywhere in the asset list
# HA25: 317 -> 259
missed_postcodes = {
postcode for postcode in eco3_list["postcode_no_space"] if
postcode not in asset_list["matching_postcode_nospace"].values
}
eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)]
# For the asset list, we create a matching address without any punctuation
# TODO: We should generally just remove punctuation from addresses when matching
asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(
r'[^\w\s]', '', regex=True
)
# Remove double spaces
# NOTE(review): as it reads here both arguments are a single space, which makes this call a no-op;
# presumably the pattern was meant to be two spaces - confirm against the repository copy.
asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace(
" ", " "
)
# Successful matches and the ids of the ECO3 rows that could not be matched
matching_lookup = []
missed = []
for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
# if row["eco3_list_row_id"] == "HA51_Eco3_22":
# raise Exception()
postcode = row["postcode_no_space"]
# df will never be empty, since we've already done a check for common postcodes
# NOTE(review): that check only runs for HA25, HA56 and HA51. This is also a substring/regex match
# (str.contains), not an equality test, so a postcode contained in a longer one would match too.
df = asset_list[
asset_list["matching_postcode_nospace"].str.contains(postcode)
]
house_number = row["NO "]
if isinstance(house_number, str):
house_number = house_number.lower().strip()
# When no candidate's HouseNo contains the number as given, try to reduce entries such as "flat x, y"
# or "flat x y" to a single leading token.
# NOTE(review): .str is used on "HouseNo" here without the astype(str) applied a few lines below -
# verify the column is always string-typed.
if not any(df["HouseNo"].str.contains(str(house_number))):
if "flat" in str(house_number):
house_number = house_number.split("flat")[1].strip()
# We check if we had an instance of flat x, y
if "," in str(house_number):
house_number = house_number.split(",")[0].strip()
# We may also have a space for an instance of flat x y
if " " in str(house_number):
house_number = house_number.split(" ")[0].strip()
# We must do the house number filter
df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
# Perform a search on streetname
# We do this to prevent duplicate matches to properties with the same postcode and house number,
# but different streets
# Only the part of the street name before the first "/" or "," is used, with punctuation removed
street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1)
df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)]
if df.empty:
missed.append(row["eco3_list_row_id"])
continue
if df.shape[0] > 1:
# Several candidates left: keep flats only when the ECO3 house number mentions a flat, otherwise
# keep only the non-flats
if "flat" in str(row["NO "]).lower():
df = df[df["matching_address"].str.contains("flat")]
else:
df = df[~df["matching_address"].str.contains("flat")]
if df.shape[0] != 1:
# Print the offending row before failing so it can be looked up by hand
print(row["Street / Block Name"])
print(house_number)
print(row["Post Code"])
raise ValueError("Investigate")
matching_lookup.append(
{
"eco3_list_row_id": row["eco3_list_row_id"],
"asset_list_row_id": df["asset_list_row_id"].values[0],
}
)
# We verify the missed
# HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2
# where many surveys were conducted on house numbers, not in the asset list
# 154 missed, 2827 matched for HA 25
# For HA56, the number of missed is high at 320, however a big portion of these are due to the block being
# listed in the asset list, and individual units being in the survey list
if len(missed) != self.UNMATCHED_ECO3[ha_name]:
raise ValueError(
f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
)
# NOTE(review): if nothing matched, the frame built from an empty list presumably has no
# "asset_list_row_id" column and the check below would raise KeyError - confirm this cannot happen.
matching_lookup = pd.DataFrame(matching_lookup)
# Check dupes as this will cause problems later on
if matching_lookup["asset_list_row_id"].duplicated().sum():
raise ValueError("Duplicated asset list row ids")
# Merge onto eco3 list
eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id")
asset_list.drop(columns=["matching_address_no_punctuation"], inplace=True)
return eco3_list
@staticmethod
def extract_streetname(address, house_number=None, postcode=None):
    """
    Reduce an address to its first comma-separated section, lower-cased, with the house number and
    postcode removed.

    :param address: The full address as a string.
    :param house_number: The house number to remove, as a string or integer. It is matched literally as a
        whole word, so characters such as "." or "(" in it carry no regex meaning.
    :param postcode: The postcode to remove, as a string. Matched literally as a whole word.
    :return: The cleaned address section, with runs of whitespace collapsed to a single space.
    """
    # Convert everything to lower case
    address = address.lower()
    if house_number is not None:
        # Remove the house number. It is escaped (like the postcode below) so that a value such as "1.2"
        # cannot match "132" and a value such as "(a" cannot raise re.error.
        house_number_pattern = r'\b{}\b'.format(re.escape(str(house_number)))
        address = re.sub(house_number_pattern, '', address, flags=re.IGNORECASE).strip()
    if postcode is not None:
        # Remove the postcode
        postcode_pattern = r'\b{}\b'.format(re.escape(postcode))
        address = re.sub(postcode_pattern, '', address, flags=re.IGNORECASE).strip()
    # Get first section before a comma; no comma can remain after this, so no comma clean-up is needed
    address = address.split(",")[0]
    # Replace multiple spaces with a single space
    return re.sub(r'\s+', ' ', address)
def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
    """
    Match every CIGA row to a single asset-list row and merge the matched "asset_list_row_id" onto the
    CIGA list.

    Candidates are narrowed step by step: postcode (as a substring of "matching_address"), house number,
    street name, presence of "flat" in the address and finally ``self.levenstein_match``. Rows that have no
    candidate after the house-number filter are recorded as unmatched and skipped.

    :param asset_list: Asset list with "matching_address", "HouseNo" and "asset_list_row_id" columns.
    :param ciga_list: CIGA list with "ciga_list_row_id", "HouseNo", "Matched Address" and "Matched Postcode".
    :param ha_name: Housing association key, used to look up the expected number of unmatched rows.
    :return: ``ciga_list`` left-merged with the matches; "asset_list_row_id" is null for unmatched rows.
    :raises ValueError: if a row does not end up with exactly one candidate after every filter, if the number
        of unmatched rows differs from ``self.UNMATCHED_CIGA[ha_name]``, or if two CIGA rows hit the same asset.
    """
    matching_lookup = []
    unmatched_addresses = []
    for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):
        house_number = row["HouseNo"]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        # Filter on the postcode, then on the exact house number
        df = asset_list[
            asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
        ].copy()
        df = df[df["HouseNo"].astype(str) == str(house_number)]

        # For ciga, we skip rows that have no candidate at all
        if df.empty:
            unmatched_addresses.append(
                {
                    "ciga_list_row_id": row["ciga_list_row_id"],
                    "HouseNo": house_number,
                    "Matched Postcode": row["Matched Postcode"]
                }
            )
            continue

        if df.shape[0] != 1:
            # We split house number and postcode out of the matched address for ciga
            street_name = self.extract_streetname(
                address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
            )
            # We check if any of the rows contains the street name and if they do, filter
            street_hits = df["matching_address"].str.replace(",", "").str.contains(street_name)
            if any(street_hits):
                df = df[street_hits]

        if df.shape[0] != 1:
            # Next we check for the presence of flat in the address
            if "flat" in row["Matched Address"].lower():
                df = df[df["matching_address"].str.contains("flat")]
            else:
                df = df[df["matching_address"].str.contains("flat") == False]

        if df.shape[0] != 1:
            # Last resort: fuzzy match on house number + address + postcode with all spaces removed
            full_key = (
                str(row["HouseNo"]).lower().strip()
                + row["Matched Address"].lower().strip()
                + row["Matched Postcode"].lower().strip()
            )
            df = self.levenstein_match(full_key.replace(" ", ""), df)

        if df.shape[0] != 1:
            # Report the CIGA columns this method actually reads, so that building the diagnostic cannot
            # itself fail on a column taken from a different list
            raise ValueError(
                "Investigate: expected exactly one asset match for CIGA row "
                f"{row['ciga_list_row_id']} (house number {house_number!r}, "
                f"address {row['Matched Address']!r}, postcode {row['Matched Postcode']!r}), "
                f"found {df.shape[0]}"
            )

        matching_lookup.append(
            {
                "ciga_list_row_id": row["ciga_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    # We have an acceptable number of ciga failures for each HA
    if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
        raise ValueError(
            f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched")

    # Name the columns explicitly so the frame has them even when nothing matched
    matching_lookup = pd.DataFrame(matching_lookup, columns=["ciga_list_row_id", "asset_list_row_id"])
    # Check dupes as this will cause problems later on
    if matching_lookup["asset_list_row_id"].duplicated().any():
        raise ValueError("Duplicated asset list row ids")

    # Merge onto the ciga list
    ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id")
    return ciga_list
@staticmethod
def identify_built_form_ha6(property_string):
    """
    Work out the built form named in a free-text property description.

    :param property_string: The string describing the property
    :return: 'Semi-Detached', 'Detached', 'Mid-Terrace' or 'End-Terrace', or None when no keyword is found
    """
    # Checked in order: "semi detached" has to come before "detached", which it contains
    keyword_to_built_form = (
        ('semi detached', 'Semi-Detached'),
        ('detached', 'Detached'),
        ('mid terrace', 'Mid-Terrace'),
        ('mid town house', 'Mid-Terrace'),
        ('end terrace', 'End-Terrace'),
        ('end town house', 'End-Terrace'),
    )
    # Compare case-insensitively
    lowered = property_string.lower()
    return next(
        (built_form for keyword, built_form in keyword_to_built_form if keyword in lowered),
        None,
    )
def load(self):
# Load the December figures CSV and the per-HA inputs (asset, survey, CIGA and ECO3 lists) onto the instance.
# Sets self.december_figures and self.data, and writes self.data to S3 as a pickle. Returns nothing.
# Get the december figures, which is just a csv
self.december_figures = pd.read_csv(self.december_figures_filepath)
# Remove the spaces in HA Name
self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "")
# Nullable integer dtype, so these count columns may contain missing values
for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]:
self.december_figures[col] = self.december_figures[col].astype("Int64")
# Start from the cached pickle when caching is on and no rebuild was requested, otherwise from scratch
if self.use_cache and not self.rebuild:
data = read_pickle_from_s3(
bucket_name="retrofit-datalake-dev",
s3_file_name="ha-analysis/batch3-inputs.pickle",
)
else:
data = {}
for filepath in self.directories:
# The HA name is the third "/"-separated component of the path
ha_name = filepath.split("/")[2]
# Skip HAs that already have an entry in data
if ha_name in data:
continue
# Load asset list
logger.info("Loading data for {}".format(ha_name))
asset_list, survey_list, ciga_list, eco3_list = self.load_asset_list(
filepath=filepath,
ha_name=ha_name,
)
data[ha_name] = {
"asset_list": asset_list,
"survey_list": survey_list,
"ciga_list": ciga_list,
"eco3_list": eco3_list
}
self.data = data
# Cache the data in s3
# We need to pickle the data and store in s3
# This is the same bucket and key that the cached copy is read from above
save_pickle_to_s3(
data=self.data,
bucket_name="retrofit-datalake-dev",
s3_file_name="ha-analysis/batch3-inputs.pickle",
)
def ha_facts_and_figures(self):
"""
Build the per-HA facts and figures table and store it on self.facts_and_figures.
For every HA in self.data the "ECO Eligibility" column of the asset list is normalised and then updated
from the CIGA, ECO3 and survey lists. The updated asset list (and, when surveys are present, the survey
list with its installation status) is written back into self.data, and the eligibility and sales counts
are merged onto self.december_figures.
:return: None - the result is stored on self.facts_and_figures
:raises ValueError: if merging the CIGA or ECO3 list changes the number of rows in the asset list
"""
# Maps the raw labels found in the first column of the survey list onto the two schemes
scheme_map = {
"ECO4": "ECO4",
"AFFORDABLE WARMTH": "ECO4",
"ECO4 A/W": "ECO4",
"ECO4 GBIS (ECO+)": "GBIS",
"ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS",
"ECO4 AFFORDABLE WARMTH": "ECO4",
"Affordable Warmth": "ECO4",
"ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS",
"ECO4 PPS": "ECO4",
"AFFORDABLE WARMTH / REMEDIAL": "ECO4",
"AFF0RDALE WARMTH": "ECO4",
"ECO 4 RdSAP CL": "ECO4",
"Affordable Warmth (R) ": "ECO4",
"Affordable Warmth ": "ECO4",
"ECO 4 AFFORDABLE WARMTH": "ECO4",
}
# Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we
# treat these as similar to subject to CIGA, and therefore unconfirmed work that could fail. There
# is only a small volume of properties for which we see this
# The keys are matched after the column has been stripped and lower-cased
eco_eligibility_map = {
"not eligble": "not eligible",
"eco 4(subject to ciga)": "eco4 (subject to ciga)",
"eco4 (subject to ciga/archetype check": "eco4 (subject to ciga) (subject to archetype)",
"eco4 (subject to archetype check)": "eco4 (subject to archetype)",
"eco4 (subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)",
"eco4 (subject to ciga)": "eco4 (subject to ciga)",
"eco4(subject to ciga)": "eco4 (subject to ciga)",
"eco4 subject to ciga": "eco4 (subject to ciga)",
"eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)",
"eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)",
"eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)",
}
# One dict of counts per HA, turned into a DataFrame once every HA has been processed
ha_facts_and_figures = []
for ha_name, data_assets in self.data.items():
asset_list = data_assets["asset_list"].copy()
survey_list = data_assets["survey_list"].copy()
ciga_list = data_assets["ciga_list"].copy()
# Unlike the other lists, the ECO3 list is optional and is not copied
eco3_list = data_assets.get("eco3_list", pd.DataFrame())
# Remembered so we can check that the left merges below never add rows
asset_list_starting_size = asset_list.shape[0]
# Change the column name if it's ECO eligibility
asset_list = asset_list.rename(
columns={
"ECO eligibility": "ECO Eligibility",
"ECO Eligibilty": "ECO Eligibility",
},
)
# Remove surplus whitespace from the ECO Eligibility column
asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip()
# Push to lower case
asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower()
# Remap
asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].replace(eco_eligibility_map)
if not ciga_list.empty:
# We merge on ciga and update the status to reflect if it has failed ciga or not
# If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA
# check
ciga_list_to_merge = ciga_list[["asset_list_row_id", "Guarantee"]].copy()
# CIGA rows that were never matched to an asset carry a null id and are left out
ciga_list_to_merge = ciga_list_to_merge[~pd.isnull(ciga_list_to_merge["asset_list_row_id"])]
asset_list = asset_list.merge(ciga_list_to_merge, how='left', on="asset_list_row_id")
asset_list["ECO Eligibility"] = np.where(
(
asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) &
(asset_list["Guarantee"] == "Yes")
),
"failed ciga",
asset_list["ECO Eligibility"]
)
# We replace any remaining "Subject to CIGA" with pass Ciga
asset_list["ECO Eligibility"] = np.where(
(
asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) &
(asset_list["Guarantee"] == "No")
),
"eco4 - passed ciga",
asset_list["ECO Eligibility"]
)
asset_list = asset_list.drop(columns=["Guarantee"])
# Update the asset list with the categorisations and rename changes
if asset_list.shape[0] != asset_list_starting_size:
raise ValueError("The asset list has changed in size")
# If we have eco3 surveys, we set a property to not eligible
if not eco3_list.empty:
# NOTE(review): null asset_list_row_id values are not filtered out here, unlike for the CIGA and
# survey lists - confirm they cannot produce matches
eco3_list_to_merge = eco3_list[["asset_list_row_id"]].copy()
eco3_list_to_merge["has_eco3"] = True
asset_list = asset_list.merge(
eco3_list_to_merge, how="left", on="asset_list_row_id"
)
if asset_list.shape[0] != asset_list_starting_size:
raise ValueError("The asset list has changed in size, when merging on eco3")
# Any rows that have an eco3 survey are set to not eligible
asset_list["ECO Eligibility"] = np.where(
asset_list["has_eco3"] == True,
"not eligible",
asset_list["ECO Eligibility"]
)
# asset_list = asset_list.drop(columns=["has_eco3"])
# Report on sales
sales_report = {}
if not survey_list.empty:
# The scheme is read from whichever column comes first in the survey list
scheme_column = survey_list.columns[0]
# Remap the values in the scheme column
survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)
# We clean up the survey list installation or cancelled
if "INSTALLED OR CANCELLED" in survey_list.columns:
survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
# Remove all punctuation
survey_list["installed_or_cancelled_clean"] = survey_list[
"installed_or_cancelled_clean"].str.replace(
r'[^\w\s]', '', regex=True
)
# Collapse runs of whitespace into a single space
survey_list["installed_or_cancelled_clean"] = survey_list[
"installed_or_cancelled_clean"].str.replace(
r'\s+', ' ', regex=True
)
# Remove leading and trailing spaces
survey_list["installed_or_cancelled_clean"] = survey_list[
"installed_or_cancelled_clean"].str.strip()
# Start with no status; each rule below only overwrites the rows it recognises
survey_list["installation_status"] = None
survey_list["installation_status"] = np.where(
survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
"installed",
survey_list["installation_status"]
)
survey_list["installation_status"] = np.where(
survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
"cancelled",
survey_list["installation_status"]
)
# Find partial installations
survey_list["installation_status"] = np.where(
survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
"in progress",
survey_list["installation_status"]
)
# Find partial cancellations
# TODO: We might have more indications of partial cancellations
survey_list["installation_status"] = np.where(
survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
"cancelled",
survey_list["installation_status"]
)
else:
# We have some examples, e.g. HA28, where we do not have the installed or cancelled column
# The date column header comes in two spellings, with and without a space before the slash
# NOTE(review): for a missing date str.contains yields NaN, which np.where appears to treat as
# truthy, so such rows would be labelled "cancelled" - confirm that this is intended
if 'INSTALL/ CANCELLATION DATE' in survey_list.columns:
survey_list["installation_status"] = np.where(
survey_list['INSTALL/ CANCELLATION DATE'].str.lower().str.contains("cancelled"),
"cancelled",
"installed",
)
else:
survey_list["installation_status"] = np.where(
survey_list['INSTALL / CANCELLATION DATE'].str.lower().str.contains("cancelled"),
"cancelled",
"installed",
)
# Finally, for other cases, we set the status to "in progress"
survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")
# We concatenate the scheme name with the installation status
survey_list["installation_status"] = (
survey_list[scheme_column] + " - " + survey_list["installation_status"]
)
# We get the sales
# NOTE(review): the "ECO4 - surveys sold" figure is the size of the whole survey list, whatever the
# scheme of each row - confirm the label is intended
sales_report = {
"ECO4 - surveys sold": survey_list.shape[0],
**survey_list["installation_status"].value_counts().to_dict()
}
# We find some cases where properties have sold but are missing CIGA checks
survey_list_to_merge = survey_list[["asset_list_row_id", "installation_status"]].copy()
survey_list_to_merge["has_a_survey_record"] = True
survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])]
asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id")
# Update the cases where properties have sold, but are missing a CIGA check
# If we don't have a CIGA list, we set the value to ECO4
set_to = "eco4 - passed ciga" if not ciga_list.empty else "eco4"
asset_list["ECO Eligibility"] = np.where(
(asset_list["ECO Eligibility"].str.contains("subject to ciga")) & (
asset_list["has_a_survey_record"] == True
),
set_to,
asset_list["ECO Eligibility"]
)
# Update the cases where a property has been marked as eligible for GBIS, but sold for ECO4
asset_list["ECO Eligibility"] = np.where(
(asset_list["ECO Eligibility"] == "gbis") & (
asset_list["installation_status"].isin(
["ECO4 - installed", "ECO4 - cancelled", "ECO4 - in progress"]
)
),
"eco4",
asset_list["ECO Eligibility"]
)
# Update the cases where a property was marked as eligible for ECO4, but sold for GBIS
asset_list["ECO Eligibility"] = np.where(
(asset_list["ECO Eligibility"].isin(
[
"eco4",
"eco4 (subject to ciga)",
"eco4 - passed ciga",
"failed ciga",
"eco4 (subject to archetype)",
"eco4 (subject to ciga) (subject to archetype)"
]
)) & (
asset_list["installation_status"].isin(
["GBIS - installed", "GBIS - cancelled", "GBIS - in progress"]
)
),
"gbis",
asset_list["ECO Eligibility"]
)
# Update the cases where a property is marked as not eligible, but sold for GBIS
asset_list["ECO Eligibility"] = np.where(
(asset_list["ECO Eligibility"] == "not eligible") & (
asset_list["installation_status"].isin(
["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"]
)),
"gbis",
asset_list["ECO Eligibility"]
)
# Update the cases where a property is marked as not eligible, but sold for ECO4
asset_list["ECO Eligibility"] = np.where(
(asset_list["ECO Eligibility"] == "not eligible") & (
asset_list["installation_status"].isin(
["ECO4 - in progress", "ECO4 - installed", "ECO4 - cancelled"]
)
),
"eco4",
asset_list["ECO Eligibility"]
)
# The helper columns merged on from the survey list are no longer needed
asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"])
# Update the survey list with installation status
self.data[ha_name]["survey_list"] = survey_list
# Insert updated asset list
self.data[ha_name]["asset_list"] = asset_list
ha_facts_and_figures.append(
{
"HA Name": ha_name,
**asset_list["ECO Eligibility"].value_counts().to_dict(),
**sales_report
}
)
ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures)
# NOTE(review): drop raises KeyError by default when the column is absent, so this presumably fails if
# no HA has any "not eligible" property - confirm that cannot happen
ha_facts_and_figures = ha_facts_and_figures.drop(
columns=["not eligible"]
)
ha_facts_and_figures = ha_facts_and_figures.fillna(0)
# Make all columns apart from HA Name integers ("HA Name" is the first key of every record above)
for col in ha_facts_and_figures.columns[1:]:
ha_facts_and_figures[col] = ha_facts_and_figures[col].astype(int)
# Inner merge: only HAs present in both the December figures and the computed counts are kept
ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name")
ha_facts_and_figures = ha_facts_and_figures.fillna(0)
self.facts_and_figures = ha_facts_and_figures
# HAs whose property type is PROPERTY_TYPE_LOOKUP[ha_name].get(<stripped value of this column>)
_STRIPPED_LOOKUP_COLUMNS = {
    "HA2": "Dwelling Type",
    "HA5": "Asset Type",
    "HA8": "Property Type",
    "HA11": "Property Type",
    "HA12": "Asset_Type1",
    "HA13": "Type Cd",
    "HA15": "Property Type",
    "HA18": "Asset Type",
    "HA20": "Asset Type",
    "HA24": "Property Type",
    "HA32": "Dwelling type",
    "HA34": "Property Type",
    "HA35": "Property Type Grouping",
    "HA37": "PROPERTY TYPE",
    "HA41": "Archetype",
    "HA42": "Dwelling use/type",
    "HA45": "Property type",
    "HA48": "Property Type",
    "HA50": "Property Type",
    "HA51": "Asset Type",
    "HA52": "Property Type",
    "HA56": "Dwelling Type Description",
    "HA63": "PropertyType",
    "HA117": "Property Type",
    "HAXXX": "Unit Description",
}
# HAs whose property type is PROPERTY_TYPE_LOOKUP[ha_name][<raw value of this column>]; an unknown value raises
_STRICT_LOOKUP_COLUMNS = {
    "HA25": "T1_AssetType",
    "HA28": "Property Type - Academy",
    "HA30": "A_AssetType",
}
# HAs whose property type is deduced from keywords in a free-text description column
_DESCRIPTION_COLUMNS = {
    "HA9": "Asset Type",
    "HA21": "Property Type",
    "HA31": "A_AssetType",
}
# HAs whose asset list value in this column is used as the property type without any change
_VERBATIM_COLUMNS = {
    "HA19": "Dwelling Type",
    "HA27": "Property Type",
    "HA54": "Property Type",
}


def get_property_type_and_built_form(property_meta, ha_name):
    """
    Derive the property type and built form for one asset-list row, using the rules of its HA.

    :param property_meta: The asset-list row, indexable by column name.
    :param ha_name: The housing association key, e.g. "HA25".
    :return: A (property_type, built_form) tuple; either element may be None.
    :raises NotImplementedError: if no rule exists for ha_name.
    """
    # No property information is derived for this HA
    if ha_name == "HA44":
        return None, None

    if ha_name in _DESCRIPTION_COLUMNS:
        description = property_meta[_DESCRIPTION_COLUMNS[ha_name]].strip().lower()
        # The first keyword found wins, in this order
        for label in ("House", "Flat", "Bungalow", "Maisonette"):
            if label.lower() in description:
                return label, None
        return None, None

    if ha_name in _STRIPPED_LOOKUP_COLUMNS:
        raw_value = property_meta[_STRIPPED_LOOKUP_COLUMNS[ha_name]]
        # Only HA52 tolerates a missing property type
        if ha_name == "HA52" and raw_value is None:
            return None, None
        return PROPERTY_TYPE_LOOKUP[ha_name].get(raw_value.strip()), None

    if ha_name in _STRICT_LOOKUP_COLUMNS:
        return PROPERTY_TYPE_LOOKUP[ha_name][property_meta[_STRICT_LOOKUP_COLUMNS[ha_name]]], None

    if ha_name in _VERBATIM_COLUMNS:
        return property_meta[_VERBATIM_COLUMNS[ha_name]], None

    # Everything below is a one-off rule for a single HA
    if ha_name == "HA1":
        asset_type = property_meta["Asset Type"]
        if asset_type == "a":
            # We correct a small error
            property_type = "House"
        elif asset_type in ("Bedsit", "Room"):
            # Remap bedsits to flats
            property_type = "Flat"
        else:
            property_type = asset_type
        built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"], None)
    elif ha_name == "HA6":
        property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]]
        built_form = property_meta["built_form"]
    elif ha_name == "HA7":
        lookups = PROPERTY_TYPE_LOOKUP[ha_name]
        property_type = lookups["property_type"].get(property_meta["Archetype"])
        built_form = lookups["built_form"].get(property_meta["Property Type"])
    elif ha_name == "HA14":
        description = property_meta["Asset Type Description"]
        if description == "Block - Repair":
            # We try and deduce if it's a flat or house, depending on if it has "room" in the address
            property_type = "House" if "room" in property_meta["Address 1"].lower() else "Flat"
        else:
            property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][description]
        built_form = None
    elif ha_name == "HA16":
        config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]]
        property_type = config.get("property-type")
        built_form = config.get("built-form")
    elif ha_name == "HA17":
        return property_meta["property_type"], None
    elif ha_name == "HA39":
        config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {})
        property_type = config.get("property_type", None)
        built_form = config.get("built_form", None)
        if property_type is None:
            # Fall back on whether the address mentions a flat
            property_type = "Flat" if "flat" in property_meta["matching_address"] else "House"
    elif ha_name == "HA49":
        property_type = property_meta["Property Class"].strip()
        built_form = None
    elif ha_name == "HA107":
        property_type = property_meta.get("property_type", None)
        built_form = property_meta.get("built_form", None)
    elif ha_name == "HAXX":
        # The property type is whatever precedes the first colon
        return property_meta["Property Type"].split(":")[0].strip(), None
    else:
        raise NotImplementedError("Implement me")
    return property_type, built_form
def get_epc_data(
    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
):
    """
    Build, per housing association (HA), the eligibility results, the model scoring inputs and the list of
    asset rows for which no EPC could be found.

    With ``pull_data=False`` the previously processed results are read back from S3 and returned untouched.
    Otherwise, for every row of the HA's asset list we search for the property's EPC, run the Warmfront
    GBIS / ECO4 eligibility checks, prepare model rows for ECO4-eligible properties, score them in chunks,
    classify the predicted post-install SAP and finally pickle the results to S3.

    :param loader: object exposing ``data``, a mapping of HA name -> dict holding "asset_list" and "survey_list"
    :param cleaned: forwarded to Eligibility, calculate_cavity_age and prepare_model_data_row
    :param cleaning_data: forwarded to prepare_model_data_row
    :param created_at: forwarded to prepare_model_data_row and used as the ModelApi timestamp
    :param photo_supply_lookup: forwarded to prepare_model_data_row
    :param floor_area_decile_thresholds: forwarded to prepare_model_data_row
    :param pull_data: when False, skip all processing and reload the stored results from S3
    :return: dict keyed by HA name, each value holding "results_df", "scoring_df" and "nodata"
    :raises ValueError: if ``loader.data`` is empty, or if a merge unexpectedly changes the number of rows
    """
    if not loader.data:
        raise ValueError("Data not found - please run loader.load() first")

    results_bucket = "retrofit-datalake-dev"
    scoring_chunk_size = 400

    outputs = {}
    for ha_name, data_assets in loader.data.items():
        results_key = f"ha-analysis/{ha_name}/processed_results.pickle"

        if not pull_data:
            # Then we retrieve the data from S3
            processed_ha_results = read_pickle_from_s3(bucket_name=results_bucket, s3_file_name=results_key)
            outputs[ha_name] = {
                "results_df": processed_ha_results["results_df"],
                "scoring_df": processed_ha_results["scoring_df"],
                "nodata": processed_ha_results["nodata"]
            }
            continue

        # For each HA, we pull in the data required, and store it in S3
        asset_list = data_assets["asset_list"].copy()

        # If the survey list is missing, it means we have not yet completed any surveys and therefore should only
        # consider the most recent EPC
        # NOTE(review): other code in this file tests `survey_list.empty`; if the loader supplies an empty
        # DataFrame rather than None this flag is always True - confirm against the loader.
        consider_penultimate_epc = data_assets["survey_list"] is not None

        # We iterate through the asset list and pull what we need
        results = []
        scoring_data = []
        nodata = []
        failed_model_rows = []
        for _, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
            if property_meta["matching_postcode"] is None:
                continue

            property_type, built_form = get_property_type_and_built_form(
                property_meta=property_meta, ha_name=ha_name
            )

            searcher = SearchEpc(
                address1=str(property_meta["HouseNo"]),
                postcode=property_meta["matching_postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                full_address=property_meta["matching_address"]
            )
            searcher.ordnance_survey_client.property_type = property_type
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                nodata.append(property_meta)
                continue

            row_id = property_meta["asset_list_row_id"]

            if searcher.newest_epc.get("estimated"):
                # We insert the row ID as our proxy for UPRN
                searcher.newest_epc["uprn"] = int(row_id.split(ha_name)[1])

            newest_epc = searcher.newest_epc
            older_epcs = searcher.older_epcs
            full_sap_epc = searcher.full_sap_epc

            # If we have a survey list, we also look at the penultimate EPC, because the newest one might
            # already reflect an installation. We fall back to the newest EPC when there is no older one.
            penultimate_epc = newest_epc
            if consider_penultimate_epc:
                penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
                if not penultimate_epc:
                    penultimate_epc = newest_epc

            eligibility = _run_warmfront_checks(epc=newest_epc, cleaned=cleaned)

            identified_for_gbis = property_meta["ECO Eligibility"] in ["gbis"]
            identified_for_eco4 = property_meta["ECO Eligibility"] in ["eco4"]

            # `gbis_warmfront` is a mapping (its "eligible" / "message" / "strict" keys are read below), so we
            # must test its "eligible" flag: the mapping itself is always truthy, which previously meant the
            # GBIS re-check could never trigger.
            gbis_eligible = eligibility.gbis_warmfront["eligible"]
            eco4_eligible = eligibility.eco4_warmfront["eligible"]

            # condition 1 - identified for gbis, but eligible for neither scheme on the newest EPC
            missed_gbis = identified_for_gbis and not gbis_eligible and not eco4_eligible
            # condition 2 - identified for eco4, but not eco4 eligible on the newest EPC
            missed_eco4 = identified_for_eco4 and not eco4_eligible

            # Every other combination (successfully identified, nothing identified, not identified but
            # seemingly eligible, CIGA-dependent) keeps the assessment made on the newest EPC.
            if consider_penultimate_epc and (missed_gbis or missed_eco4):
                # We check the penultimate epc
                eligibility = _run_warmfront_checks(epc=penultimate_epc, cleaned=cleaned)
                # The "older" EPCs become every record except the one now used for the assessment. We only
                # do this for non-estimated EPCs, which are filtered by their lmk-key.
                if penultimate_epc.get("estimated") is None:
                    older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]

            # For properties identified for ECO4 that still do not look eligible, we estimate the age of the
            # cavity so that particularly old fills can be flagged.
            # (original note: "Loft MUST be suitable" - not enforced here, presumably handled downstream)
            cavity_age = None
            if identified_for_eco4 and not eligibility.eco4_warmfront["eligible"]:
                cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)

            if eligibility.eco4_warmfront["eligible"]:
                if eligibility.epc["uprn"] == "":
                    eligibility.epc["uprn"] = int(row_id.split(ha_name)[1])

                try:
                    scoring_dictionary = prepare_model_data_row(
                        property_id=row_id,
                        modelling_epc=eligibility.epc,
                        cleaned=cleaned,
                        cleaning_data=cleaning_data,
                        created_at=created_at,
                        old_data=older_epcs,
                        full_sap_epc=full_sap_epc,
                        photo_supply_lookup=photo_supply_lookup,
                        floor_area_decile_thresholds=floor_area_decile_thresholds
                    )
                    scoring_data.extend(scoring_dictionary)
                except Exception as exc:
                    # Best effort: a single bad row must not stop the run, but we keep a visible record of it
                    logger.warning("Failed to prepare model data for %s: %s", row_id, exc)
                    failed_model_rows.append(row_id)

            results.append(
                {
                    "row_id": row_id,
                    "uprn": eligibility.epc["uprn"],
                    "is_estimated": searcher.newest_epc.get("estimated") is not None,
                    "property_type": eligibility.epc["property-type"],
                    "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                    "eco4_message": eligibility.eco4_warmfront["message"],
                    "eco4_strict": eligibility.eco4_warmfront["strict"],
                    "gbis_eligible": eligibility.gbis_warmfront["eligible"],
                    "gbis_message": eligibility.gbis_warmfront["message"],
                    "gbis_strict": eligibility.gbis_warmfront["strict"],
                    "sap": float(eligibility.epc["current-energy-efficiency"]),
                    # Property components
                    "roof": eligibility.roof["clean_description"],
                    "walls": eligibility.walls["clean_description"],
                    "cavity_type": eligibility.cavity["type"],
                    "heating": eligibility.epc["mainheat-description"],
                    "tenure": eligibility.tenure,
                    "date_epc": eligibility.epc["lodgement-date"],
                    "loft_thickness": eligibility.roof["insulation_thickness"],
                    "cavity_age": cavity_age,
                    "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"],
                    "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"]
                }
            )

        if failed_model_rows:
            logger.warning(
                "%s: model data preparation failed for %d properties: %s",
                ha_name, len(failed_model_rows), failed_model_rows
            )

        results_df = pd.DataFrame(results)
        scoring_df = pd.DataFrame(scoring_data)

        # Placeholders, so the columns exist even when there is nothing to score
        results_df["post_install_sap"] = None
        results_df["eligibility_classification"] = None

        if not scoring_df.empty:
            scoring_df = scoring_df.drop(
                columns=[
                    "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
                    "carbon_ending"
                ]
            )

            model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
            model_api.MODEL_PREFIXES = ["sap_change_predictions"]
            scoring_df["id"] = scoring_df["id"] + "phase=0"

            # We split up the scoring_df and score it chunk by chunk
            predictions = []
            to_loop_over = range(0, scoring_df.shape[0], scoring_chunk_size)
            for chunk in tqdm(to_loop_over, total=len(to_loop_over)):
                predictions_dict = model_api.predict_all(
                    df=scoring_df.iloc[chunk:chunk + scoring_chunk_size],
                    bucket="retrofit-data-dev",
                    prediction_buckets={
                        "sap_change_predictions": "retrofit-sap-predictions-dev",
                    }
                )
                predictions.append(predictions_dict["sap_change_predictions"])

            predictions = pd.concat(predictions)
            predictions_size = predictions.shape[0]
            predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
                results_df[["row_id", "sap"]], how="left", on="row_id"
            )
            if predictions.shape[0] != predictions_size:
                raise ValueError("Predictions size has changed")

            predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]

            results_df = results_df.merge(
                predictions[["sap_uplift", "row_id"]],
                how="left",
                on="row_id"
            )
            results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]

            # Classify every ECO4-eligible property by how comfortably the predicted SAP clears its target.
            # Eligible rows whose scoring failed have a missing post-install SAP and end up as "unlikely".
            eco4_rows = results_df[results_df["eco4_eligible"] == True]  # noqa: E712 - element-wise comparison
            eligibility_assessment = pd.DataFrame(
                [
                    {
                        "row_id": row["row_id"],
                        "eligibility_classification": _classify_post_install_sap(
                            current_sap=row["sap"], post_install_sap=row["post_install_sap"]
                        )
                    }
                    for _, row in eco4_rows.iterrows()
                ]
            )

            # NOTE(review): results_df already carries the placeholder "eligibility_classification" column, so
            # this merge yields "eligibility_classification_x" (placeholder) and "eligibility_classification_y"
            # (computed). Left unchanged so stored pickles and downstream readers keep the same schema.
            results_df = results_df.merge(
                eligibility_assessment, how="left", on="row_id"
            )

            # Make sure the results haven't changed in size
            if results_df.shape[0] != len(results):
                raise ValueError("results has changed size")

        # We store the results in S3 as a pickle
        save_pickle_to_s3(
            data={
                "results_df": results_df,
                "scoring_df": scoring_df,
                "nodata": nodata
            },
            bucket_name=results_bucket,
            s3_file_name=results_key
        )

        outputs[ha_name] = {
            "results_df": results_df,
            "scoring_df": scoring_df,
            "nodata": nodata
        }

    return outputs


def _run_warmfront_checks(epc, cleaned):
    # Build an Eligibility for the given EPC and run both Warmfront checks (GBIS, then ECO4) on it.
    eligibility = Eligibility(epc=epc, cleaned=cleaned)
    eligibility.check_gbis_warmfront()
    eligibility.check_eco4_warmfront()
    return eligibility


def _classify_post_install_sap(current_sap, post_install_sap):
    # Grade the predicted post-install SAP against the target for the property's starting SAP.
    # The upgrade requirement depends on the current SAP: a property at 38 or below (presumably EPC F/G)
    # is measured against 55, anything else against 69. Two points above the target is "highest
    # confidence", on target is "high", up to two points short is "medium", anything else "unlikely".
    target_sap = 55 if current_sap <= 38 else 69
    if post_install_sap >= target_sap + 2:
        return "highest confidence"
    if post_install_sap >= target_sap:
        return "high confidence"
    if post_install_sap >= target_sap - 2:
        return "medium confidence"
    return "unlikely"
def get_col_widths(dataframe):
    """
    Compute a display width for every column of ``dataframe`` (e.g. for sizing spreadsheet columns).

    Each width is the longest string representation found in the column, or the header length plus two
    characters of padding if that is larger, capped at 25 characters. For MultiIndex columns the header
    length is that of the longest level label.

    :param dataframe: the DataFrame whose columns should be measured
    :return: list of plain ints, one per column, in column order
    """
    # Define a maximum width for any column to prevent excessively wide columns
    max_allowed_width = 25
    header_padding = 2

    has_multi_index = isinstance(dataframe.columns, pd.MultiIndex)

    widths = []
    for position, label in enumerate(dataframe.columns):
        if has_multi_index:
            # The label is a tuple of level values; the widest level drives the header width
            header_width = max(len(str(level)) for level in label) + header_padding
        else:
            header_width = len(str(label)) + header_padding

        # Select by position so duplicated column labels still yield a single column
        cell_lengths = dataframe.iloc[:, position].astype(str).map(len)
        # An empty column has no max (it would be NaN), so it contributes no data width
        data_width = int(cell_lengths.max()) if len(cell_lengths) else 0

        widths.append(min(max(data_width, header_width), max_allowed_width))

    return widths
# def analyse_ha_data(outputs, loader):
# """
# The approach we take within this function is the following:
# For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
# characterisation can be broken down as the following:
# 1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria
# 2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to
# a CIGA check
# 3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft
# insulation
# 4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under
# any circumstances, given the available data
#
# Then, for any property that has NOT been identified by Warmfront, we identify properties that look like they would
# qualify under the strictest criteria, and mark these as potential additional opportunities.
#
# :return:
# """
#
# eco4_rate = 1710
# gbis_rate = 600
# # old_eco4_rate = 1456
# old_gbis_rate = 432
#
# epc_c_threshold = 80
# scheme_map = {
# "ECO4": "ECO4",
# "AFFORDABLE WARMTH": "ECO4",
# "ECO4 A/W": "ECO4",
# "ECO4 GBIS (ECO+)": "GBIS"
# }
#
# ha_analysis_results = []
# total_revenue_results = []
# for ha_name, datasets in outputs.items():
# inputs = [x for k, x in loader.data.items() if k == ha_name][0]
#
# results_df = datasets["results_df"].copy()
#
# analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename(
# columns={"row_meaning": "asset_identification_status"}
# ).merge(
# results_df,
# how="left",
# right_on="row_id",
# left_on="asset_list_row_id"
# )
#
# analysis_data["is_remaining"] = True
#
# n_sold_eco4 = 0
# n_sold_gbis = 0
# if not inputs["survey_list"].empty:
# # Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had
# # a survey)
# survey_list = inputs["survey_list"].copy()
#
# # TODO: TEMP
# scheme_column = survey_list.columns[0]
# # We clean up the survey list installation or cancelled
# survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
# # Remove all punctuation
# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
# r'[^\w\s]', '', regex=True
# )
# # Remove double spaces
# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
# r'\s+', ' ', regex=True
# )
# # Remove trailing spaces
# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip()
#
# # Remap the values in the scheme column
# survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)
#
# survey_list["installation_status"] = None
# survey_list["installation_status"] = np.where(
# survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
# "installed",
# survey_list["installation_status"]
# )
# survey_list["installation_status"] = np.where(
# survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
# "cancelled",
# survey_list["installation_status"]
# )
# # Find partial installations
# survey_list["installation_status"] = np.where(
# survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
# "partially installed",
# survey_list["installation_status"]
# )
# # Find partial cancellations
# # TODO: We might have more indications of partial cancellations
# survey_list["installation_status"] = np.where(
# survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
# "partially cancelled",
# survey_list["installation_status"]
# )
#
# # Finally, for other cases, we set the status to "in progress"
# survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")
#
# # We concatenate the scheme name with the installation status
# survey_list["installation_status"] = (
# survey_list[scheme_column] + " - " + survey_list["installation_status"]
# )
#
# # TODO: END TEMP
#
# survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy()
# survey_list_to_merge["is_remaining"] = False
# analysis_data = analysis_data.drop(columns="is_remaining").merge(
# survey_list_to_merge,
# how="left", on="asset_list_row_id"
# )
# analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True)
#
# n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0]
# n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0]
#
# # Take just remaining
# analysis_data = analysis_data[analysis_data["is_remaining"]]
#
# # Also, if the HA has started selling, we remove any that are still subject to ciga
# n_eco4_missed_subject_to_ciga = 0
# if not inputs["survey_list"].empty:
# n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum()
# analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"]
#
# ################################################################################################
# # We take the properties that strictly qualified under eco
# ################################################################################################
#
# eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy()
# eco4_identified["identification_type"] = None
# eco4_identified["identification_type"] = np.where(
# (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True),
# "strict",
# eco4_identified["identification_type"]
# )
#
# # For expansive, the property can be no higher than an EPC C
# eco4_identified["identification_type"] = np.where(
# (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & (
# eco4_identified["sap"] <= epc_c_threshold
# ),
# "expansive",
# eco4_identified["identification_type"]
# )
# ################################################################################################
# # We take the properties dependent on CIGA
# ################################################################################################
#
# ciga_dependent_identified = analysis_data[
# analysis_data["ECO Eligibility"].isin(
# [
# "eco4 (subject to ciga)",
# "eco4 - passed ciga"
# ]
# )
# ].copy()
#
# # These are properties that show filled cavity
# ciga_dependent_identified["identification_type"] = None
# ciga_dependent_identified["identification_type"] = np.where(
# ciga_dependent_identified["eco4_message"].isin(
# [
# "Perfect suitability",
# "Meets cavity and sap",
# "Fails cavity, meets loft, fails SAP",
# "Meets fabric, fails SAP check",
# "Meets cavity, loft borderline, meets sap",
# ]
# ) & (ciga_dependent_identified["sap"] <= epc_c_threshold),
# "strict",
# ciga_dependent_identified["identification_type"]
# )
#
# ciga_dependent_identified["identification_type"] = np.where(
# ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | (
# ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])
# )) & (
# (ciga_dependent_identified["sap"] <= epc_c_threshold) &
# pd.isnull(ciga_dependent_identified["identification_type"])
# ),
# "expansive",
# ciga_dependent_identified["identification_type"]
# )
#
# ################################################################################################
# # We take the properties that qualified for gbis
# ################################################################################################
# gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy()
# gbis_identified["identification_type"] = None
# gbis_identified["identification_type"] = np.where(
# (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69),
# "strict",
# gbis_identified["identification_type"]
# )
#
# gbis_identified["identification_type"] = np.where(
# (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & (
# pd.isnull(gbis_identified["identification_type"])
# ),
# "expansive",
# gbis_identified["identification_type"]
# )
#
# # Finally, we look at the properties that have not been identified by Warmfront
# not_identified = analysis_data[
# analysis_data["ECO Eligibility"].isin(
# [
# "not eligible"
# ]
# )
# ].copy()
#
# surplus_eco4 = not_identified[
# (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin(
# ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"]
# ))
# ]
#
# surplus_gbis = not_identified[
# (not_identified["gbis_eligible"] == True) & (
# ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values)
# ) & (not_identified["sap"] < 69) & (
# (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | (
# not_identified["walls"].str.contains("partial", case=False, na=False)
# )
# )
# ]
# surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False]
#
# # Output variables - the data was sent to us in December, but the remaining figures are
# # what was in November
# november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name]
#
# # ECO4
# n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0]
# november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0)
# november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0]
# eco4_sales_since_november = n_sold_eco4 - november_eco4_sold
#
# n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0]
# eco4_of_which_identified_strict = (
# eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] +
# ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0]
# )
# eco4_of_which_identified_expansive = (
# eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] +
# ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0]
# )
# # GBIS
# n_warmfront_identified_gbis = gbis_identified.shape[0]
# november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0)
# november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0]
# gbis_sales_since_november = n_sold_gbis - november_gbis_sold
# gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0]
# gbis_of_which_identified_expansive = \
# gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0]
#
# to_append = {
# ("", "HA Name"): ha_name,
# ("", "# properties in asset list"): n_properties_remaining_in_asset_list,
# ############
# # ECO4
# ############
# ("ECO4", "# remaining November file"): november_eco4_remaining,
# ("ECO4", "# sold in November file"): november_eco4_sold,
# ("ECO4", "# sold (survey list)"): n_sold_eco4,
# ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga,
# ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4,
# ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict,
# ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive,
# ("ECO4", "Of which identified by model - total"): (
# eco4_of_which_identified_strict + eco4_of_which_identified_expansive
# ),
# ("ECO4", "Additional properties"): surplus_eco4.shape[0],
# ############
# # GBIS
# ############
# ("GBIS", "# remaining November file"): november_gbis_remaining,
# ("GBIS", "# sold in November file"): november_gbis_sold,
# ("GBIS", "# sold (survey list)"): n_sold_gbis,
# ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis,
# ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict,
# ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive,
# ("GBIS", "Of which identified by model - total"): (
# gbis_of_which_identified_strict + gbis_of_which_identified_expansive
# ),
# ("GBIS", "Additional properties"): surplus_gbis.shape[0]
# }
#
# ha_analysis_results.append(to_append)
#
# # Calculate the revenue results
# to_append_revenue = {
# ("", "HA Name"): ha_name,
# # Eco4 revenue
# ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate,
# ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate,
# ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate,
# ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate,
# ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate,
# ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate,
# ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate,
# ("ECO4", "Of which identified by model - total"): eco4_rate * (
# eco4_of_which_identified_strict + eco4_of_which_identified_expansive
# ),
# ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0],
# }
# total_revenue_results.append(to_append_revenue)
#
# ha_analysis_results = pd.DataFrame(ha_analysis_results)
# ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns)
#
# facts_and_figures = loader.facts_and_figures.copy()
# facts_and_figures["ha_number"] = facts_and_figures["HA Name"].str.extract(r'(\d+)').astype(int)
# facts_and_figures = facts_and_figures.sort_values("ha_number")
# facts_and_figures = facts_and_figures.drop(columns=["ha_number"])
#
# # Rename some of the cols
# facts_and_figures = facts_and_figures.rename(
# columns={
# # ECO4 cols
# "ECO4": "ECO4 - November",
# "GBIS": "GBIS - November",
# "eco4 (subject to ciga)": "ECO4 - subject to ciga",
# "eco4": "ECO4 - doesn't need CIGA",
# "eco4 - passed ciga": "ECO4 - passed CIGA",
# "failed ciga": "ECO4 - failed CIGA",
# "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS",
# "ECO4 - in progress": "ECO4 - Install in progress",
# "ECO4 - cancelled": "ECO4 - Install cancelled",
# # GBIS cols
# "gbis": "GBIS total (asset list)"
# }
# )
# # We calculate the eco4 total from the asset list
# # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is
# # ECO4 - doesn't need CIGA + ECO4 - passed CIGA
# # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is
# # ECO4 - doesn't need CIGA + ECO4 - subject to ciga
# facts_and_figures["ECO4 total (asset list - pre ciga)"] = (
# facts_and_figures["ECO4 - doesn't need CIGA"] +
# facts_and_figures["ECO4 - subject to ciga"] +
# facts_and_figures["ECO4 - passed CIGA"]
# )
#
# facts_and_figures["ECO4 total (asset list - post ciga)"] = None
# facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where(
# facts_and_figures["ECO4 - passed CIGA"] > 0,
# facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"],
# facts_and_figures["ECO4 total (asset list - post ciga)"]
# )
#
# # Re-arrange the columns
# facts_and_figures = facts_and_figures[
# [
# 'HA Name',
# 'ECO4 - November',
# 'GBIS - November',
# 'ECO4 total (asset list - pre ciga)',
# 'ECO4 total (asset list - post ciga)',
# 'GBIS total (asset list)',
# 'ECO4 - subject to ciga',
# "ECO4 - doesn't need CIGA",
# 'ECO4 - passed CIGA',
# 'ECO4 - failed CIGA',
# 'ECO4 - installed',
# 'ECO4 - Install in progress',
# 'ECO4 - Install cancelled',
# 'ECO4 - partially installed',
# 'ECO4 - Install downgrade to GBIS',
# ]
# ]
# # Add a note to flag any rows where (ECO4 - subject to ciga is greater than 0) and
# # (ECO4 - passed ciga is greater than 0)
# facts_and_figures["Missed CIGA checks opportunity"] = None
# facts_and_figures["Missed CIGA checks opportunity"] = np.where(
# (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0),
# "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype(
# str) + " ECO4 properties needing a CIGA check",
# facts_and_figures["Missed CIGA checks opportunity"]
# )
#
# facts_and_figures.to_csv("Facts and figures sample.csv")
#
# # Re arrage the columns
#
# # Also sort ha_analysis_results by ha number
# ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int)
# ha_analysis_results = ha_analysis_results.sort_values("ha_number")
# ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"])
#
# # We save 2 sheets
# # Automate creation of the excel
# # Create a Pandas Excel writer using XlsxWriter as the engine
# with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer:
# # Write each dataframe to a different worksheet without the index
# for df, sheet in [(facts_and_figures, 'HA Facts and Figures'),
# (ha_analysis_results, 'Asset Identification')]:
#
# df.to_excel(writer, sheet_name=sheet)
#
# # Auto-adjust columns' width
# for i, width in enumerate(get_col_widths(df)):
# writer.sheets[sheet].set_column(i, i, width)
#
# # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their
# # description, and what proportion of time they get identified via non-invasive surveys
#
# # true_eco4_assets = []
# # ciga_dependent_assets = []
# # not_eligible = []
# # as_built_insulated = []
# # date_cols = {
# # "HA39": "date_built",
# # "HA14": "Built In Year",
# # "HA6": "Construction Year",
# # "HA1": "Build Date",
# # "HA107": "YEAR BUILT"
# # }
# # for ha_name, data_objects in outputs.items():
# # inputs = [x for k, x in loader.data.items() if k == ha_name][0]
# #
# # date_col = date_cols[ha_name]
# # results_df = data_objects["results_df"].copy()
# # df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename(
# # columns={"row_meaning": "asset_identification_status", date_col: "date_built"}
# # ).merge(
# # results_df,
# # how="left",
# # right_on="row_id",
# # left_on="asset_list_row_id"
# # )
# #
# # # take the true ECO4
# # true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy()
# # ciga_dependent = df[
# # df["ECO Eligibility"].isin(
# # [
# # "eco4 (subject to ciga)",
# # "failed ciga",
# # "eco4 - passed ciga"
# # ]
# # )
# # ]
# # insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy()
# # # We convert date built to datetime
# # try:
# # insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])]
# # insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year
# # as_built_insulated.append(insulated_assumed)
# # except Exception as e:
# # print("oh well")
# #
# # true_eco4_assets.append(true_eco4)
# # ciga_dependent_assets.append(ciga_dependent)
# #
# # true_eco4_assets = pd.concat(true_eco4_assets)
# # ciga_dependent_assets = pd.concat(ciga_dependent_assets)
# # as_built_insulated = pd.concat(as_built_insulated)
# #
# # true_eco4_assets["walls"].value_counts(normalize=True)
# # ciga_dependent_assets["walls"].value_counts(normalize=True)
# #
# # from recommendations.recommendation_utils import extract_insulation_thickness
# #
# # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply(
# # lambda x: extract_insulation_thickness(x)
# # )
# #
# # true_eco4_assets["e"] = true_eco4_assets.merge(
# # pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]],
# # how="left",
# # left_on="roof",
# # right_on="original_description"
# # )
# #
# # true_eco4_assets["sap"].mean()
# #
# # true_eco4_assets["insulation_thickness"].isin(
# # ["250", "150", "200", "100", "75", "50"]
# # ).sum() / true_eco4_assets.shape[0]
# #
# # true_eco4_assets["insulation_thickness"].isin(
# # ["100"]
# # ).sum() / true_eco4_assets.shape[0]
# #
# # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True)
def get_propensity_model_data(
        loader, cleaned, cleaning_data, created_at, photo_supply_lookup,
        floor_area_decile_thresholds, pull_data=True, random_state=None
):
    """
    Build the per-HA training data for the propensity model and store it in S3.

    For every HA in `loader.data` with a non-empty survey list:
      1. The share of asset rows whose "ECO Eligibility" is not "not eligible" is used as a success rate, and
         ceil(number_sold / success_rate) - number_sold rows are drawn at random from the asset list.
      2. The surveyed rows plus the random draw form the sample; an EPC is searched for each sampled property.
      3. Only properties with exactly one EPC (no older EPCs and no full SAP EPC) are kept; their prepared EPC
         is collected together with the eligibility label and the asset list row id.
      4. The collected rows are pickled to S3 (`propensity_model_data/<ha_name>/train.pickle`), and any
         per-property failures to `propensity_model_data/<ha_name>/errors.pickle`.

    :param loader: object whose `data` attribute maps HA name -> dict holding "survey_list" and "asset_list"
        DataFrames
    :param cleaned: not used by this function
    :param cleaning_data: passed through to `EPCRecord`
    :param created_at: not used by this function
    :param photo_supply_lookup: not used by this function
    :param floor_area_decile_thresholds: not used by this function
    :param pull_data: not used by this function
    :param random_state: optional seed forwarded to `Series.sample` so the random draw is reproducible.
        The default (None) keeps the previous unseeded behaviour
    :return: list with one DataFrame per processed HA
    """
    model_data = []
    for ha_name, data_assets in loader.data.items():
        logger.info("Processing HA: %s", ha_name)
        survey_list = data_assets["survey_list"]
        if survey_list.empty:
            continue
        number_sold = survey_list.shape[0]
        # For each HA, we pull in the data required, and store in S3
        asset_list = data_assets["asset_list"].copy()
        # We determine the number of properties that we should select that are eligible
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
        if n_eligible == 0:
            # Without any eligible asset the success rate is zero and the sample size is undefined
            logger.warning("Skipping HA %s: no eligible assets in the asset list", ha_name)
            continue
        success_rate = n_eligible / asset_list_size
        needed_sample_size = np.ceil(number_sold / success_rate)
        # We cannot draw more rows than the asset list holds (sampling is without replacement)
        number_negative_samples = min(int(needed_sample_size - number_sold), asset_list_size)

        sold_asset_list_ids = survey_list["asset_list_row_id"].tolist()
        negative_sample_asset_list_ids = asset_list["asset_list_row_id"].sample(
            number_negative_samples, random_state=random_state
        ).tolist()
        sample_ids = sold_asset_list_ids + negative_sample_asset_list_ids
        sample_asset_list = asset_list[asset_list["asset_list_row_id"].isin(sample_ids)]

        # In order to have the most confidence, we should take just properties that have 1 EPC. We might need to
        # cut down the number of properties that we include because of this
        # Note: This is an imbalanced problem so we will need to build a model accommodating of that
        data = []
        errors = []
        for _, property_meta in tqdm(sample_asset_list.iterrows(), total=len(sample_asset_list)):
            # Missing postcodes can be None or NaN once they have been through pandas
            if pd.isnull(property_meta["matching_postcode"]):
                continue

            property_type, built_form = get_property_type_and_built_form(
                property_meta=property_meta, ha_name=ha_name
            )
            searcher = SearchEpc(
                address1=str(property_meta["HouseNo"]),
                postcode=property_meta["matching_postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                full_address=property_meta["matching_address"]
            )
            searcher.ordnance_survey_client.property_type = property_type
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                continue
            if searcher.newest_epc.get("estimated"):
                # We insert the row ID as our proxy for UPRN
                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])

            newest_epc = searcher.newest_epc
            older_epcs = searcher.older_epcs
            full_sap_epc = searcher.full_sap_epc
            # If we have more than 1 EPC for the moment we just continue
            if older_epcs or full_sap_epc:
                continue

            try:
                # We clean up the data
                epc_records = {
                    'original_epc': newest_epc.copy(),
                    'full_sap_epc': full_sap_epc.copy(),
                    'old_data': older_epcs.copy(),
                }
                epc_record = EPCRecord(
                    epc_records=epc_records,
                    run_mode="newdata",
                    cleaning_data=cleaning_data
                )
                data.append(
                    {
                        "ECO Eligibility": property_meta["ECO Eligibility"],
                        "asset_list_row_id": property_meta["asset_list_row_id"],
                        **epc_record.get("prepared_epc")
                    }
                )
            except Exception as e:
                # Best effort: a single bad property must not abort the HA, so we record it and move on
                errors.append(
                    {
                        "error": str(e),
                        "asset_list_row_id": property_meta["asset_list_row_id"],
                        "matching_postcode": property_meta["matching_postcode"],
                        "matching_address": property_meta["matching_address"]
                    }
                )

        data = pd.DataFrame(data)
        # We store the results in S3 as a pickle
        save_pickle_to_s3(
            data=data,
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
        )
        # Store the errors
        if errors:
            save_pickle_to_s3(
                data=errors,
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/errors.pickle"
            )
        model_data.append(data)
    return model_data
def conversion_model(loader):
    """
    Proof-of-concept conversion model: fit an XGBoost classifier on the stored propensity data and print the
    feature importances, most important first.

    The training data for each HA is read from S3 (`propensity_model_data/<ha_name>/train.pickle`), joined to the
    HA's survey list to get the installation status, and labelled positive when the status is
    "ECO4 - in progress" or "ECO4 - installed". HAs whose data cannot be read are logged and skipped.

    :param loader: object whose `data` attribute maps HA name -> dict holding "survey_list" and "asset_list"
        DataFrames
    :return: None, the ranked feature importances are printed
    :raises ValueError: if no HA data could be read at all
    """
    # Read in the model data
    model_data = []
    for ha_name in loader.data.keys():
        try:
            picked = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
            )
            data = pd.DataFrame(picked)
            # We merge on the sales data
            sales_data = loader.data[ha_name]["survey_list"].copy()
            data = data.merge(
                sales_data[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            data["ha_name"] = ha_name
        except Exception:
            # Best effort: skip the HA, but keep the traceback so the cause is not lost
            logger.exception("Error reading in the data for %s", ha_name)
            continue
        model_data.append(data)

    if not model_data:
        raise ValueError("No propensity model data could be read for any HA")

    model_data = pd.concat(model_data)
    model_data["response"] = model_data["installation_status"].isin(
        [
            "ECO4 - in progress",
            "ECO4 - installed"
        ]
    ).astype(int)

    # Because of how we pulled the data, we need to re-balance the sample
    balanced_sample = []
    for ha_name in model_data["ha_name"].unique():
        df = model_data[model_data["ha_name"] == ha_name]
        positive_samples = df[df["response"] == 1]
        negative_samples = df[df["response"] != 1]

        asset_list = loader.data[ha_name]["asset_list"]
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
        success_rate = n_eligible / asset_list_size
        needed_sample_size = np.ceil(positive_samples.shape[0] / success_rate)
        number_negative_samples = int(needed_sample_size - positive_samples.shape[0])
        # Properties were dropped while pulling the data, so fewer negatives than requested may be available.
        # Sampling is without replacement, so we cap the request at what we actually have
        number_negative_samples = min(number_negative_samples, negative_samples.shape[0])
        negative_samples_subset = negative_samples.sample(number_negative_samples)
        balanced_sample.append(pd.concat([positive_samples, negative_samples_subset]))
    balanced_sample = pd.concat(balanced_sample)

    # We work with a small sample
    # Drop the identifier, address and label-leaking columns so only model features and the response remain
    balanced_sample = balanced_sample.drop(
        columns=['ECO Eligibility', 'asset_list_row_id', 'address', 'uprn_source', 'address3', 'local_authority_label',
                 'county', 'postcode', 'constituency', 'local_authority', 'inspection_date', 'address1',
                 'constituency_label', 'building_reference_number', 'address2', 'posttown', 'lodgement_datetime',
                 'uprn', 'lodgement_date', 'lmk_key', 'installation_status', 'ha_name']
    )

    # POC model
    df = balanced_sample.copy()
    # Fill missings with means, if they exist
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    df[categorical_cols] = df[categorical_cols].fillna("other")

    # Reduce the number of categories to a specific number and the rest to other
    max_n_categories = 10
    for col in categorical_cols:
        top_categories = df[col].value_counts().nlargest(max_n_categories).index
        df[col] = df[col].where(df[col].isin(top_categories), other="other")

    # Use a model based approach to feature selection
    import xgboost as xgb
    from sklearn.model_selection import train_test_split

    # The outcome column is 'response'
    X = df.drop(columns=['response'])
    y = df['response']
    # Encoding categorical variables if not already done
    X = pd.get_dummies(X, drop_first=True)
    # Splitting the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit an XGBoost classifier with default settings
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Map feature importances to their corresponding column names and sort, most important first
    feature_importance_dict = dict(zip(X.columns, model.feature_importances_))
    sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)
    # Display sorted features
    for feature, importance in sorted_features:
        print(f"{feature}: {importance}")
def patch_cleaned(cleaned):
    """
    Patch the cleaned description lookups, in place, with records that are missing from them.

    The following is added or changed:
      - two "uninsulated (assumed)" floor descriptions
      - the "Pitched, Unknown loft insulation" and "Pitched, 300+mm loft insulation" roof descriptions
      - an "Average thermal transmittance <value>" roof description for every value from 0 to 1.99 in steps of
        0.01, in two spellings of the original description (see the loops below)
      - a 'None' main heating control description
      - the existing '(Same dwelling below) insulated (assumed)' floor description is flagged as having another
        property below, with a thermal transmittance of 0

    :param cleaned: dict holding lists of dicts under the keys "floor-description", "roof-description" and
        "mainheatcont-description"
    :return: the same `cleaned` object, after it has been modified in place
    """
    # Patch to handle missing floor descriptions
    cleaned["floor-description"].extend(
        [
            {'original_description': 'To external air, uninsulated (assumed)',
             'clean_description': 'To external air, no insulation', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': False,
             'is_to_external_air': True, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'none'},
            {'original_description': 'To unheated space, uninsulated (assumed)',
             'clean_description': 'To unheated space, uninsulated', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': True,
             'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'average'}
        ]
    )

    # Patch to handle missing roof descriptions. Each record is added once only: the "Unknown loft insulation"
    # record was previously appended twice, leaving a duplicated lookup row
    cleaned["roof-description"].extend(
        [
            {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_pitched': True,
             'is_roof_room': False,
             'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'},
            {'original_description': 'Pitched, 300+mm loft insulation',
             'clean_description': 'Pitched, 300+ mm loft insulation', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_pitched': True, 'is_roof_room': False, 'is_loft': True,
             'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '300+'}
        ]
    )

    thermal_transmittance_values = [round(ttv, 2) for ttv in np.arange(0, 2, 0.01)]

    for ttv_rounded in thermal_transmittance_values:
        # We look for an instance of that thermal transmittance value
        already_present = any(
            x["thermal_transmittance"] == ttv_rounded and "Average thermal transmittance" in x["clean_description"]
            for x in cleaned["roof-description"]
        )
        if already_present:
            continue
        # We patch the record
        cleaned["roof-description"].append(
            {'original_description': f'Average thermal transmittance {ttv_rounded} W/m-¦K',
             'clean_description': f'Average thermal transmittance {ttv_rounded} w/m-¦k',
             'thermal_transmittance': ttv_rounded,
             'thermal_transmittance_unit': 'w/m-¦k', 'is_pitched': False, 'is_roof_room': False, 'is_loft': False,
             'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None}
        )

    # We also patch a funny unit value we found ("W/m?K"), where the value is always written with 2 decimals
    # NOTE(review): the check below looks for unit "w/m?K" but the record we append carries unit 'w/m-¦k', so
    # records added here are never recognised by it and a second call appends them again - confirm the intent
    for ttv_rounded in thermal_transmittance_values:
        already_present = any(
            x["thermal_transmittance"] == ttv_rounded
            and "Average thermal transmittance" in x["clean_description"]
            and x["thermal_transmittance_unit"] == "w/m?K"
            for x in cleaned["roof-description"]
        )
        if already_present:
            continue
        # We patch the record
        ttv_string = f"{ttv_rounded:.2f}"
        cleaned["roof-description"].append(
            {'original_description': f'Average thermal transmittance {ttv_string} W/m?K',
             'clean_description': f'Average thermal transmittance {ttv_string} w/m-¦k',
             'thermal_transmittance': ttv_rounded,
             'thermal_transmittance_unit': 'w/m-¦k', 'is_pitched': False, 'is_roof_room': False, 'is_loft': False,
             'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None}
        )

    # Patch mainheatcont-description
    cleaned["mainheatcont-description"].extend(
        [
            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
             'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
             'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
             'rate_control': None}
        ]
    )

    # We patch this record because there is another property below
    for x in cleaned["floor-description"]:
        if x["original_description"] == '(Same dwelling below) insulated (assumed)':
            x["another_property_below"] = True
            x["thermal_transmittance"] = 0

    return cleaned
def calculate_eco4_post_ciga(
        eligiblity_counts, input_data, ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate,
        eco4_rate, archetype_conversion_rate
):
    """
    Forecast the number (and value) of ECO4 sales once archetype checks, CIGA checks and cancellations have been
    accounted for.

    The eligibility counts are split into two pools:
      - jobs that do not wait on a CIGA check: "eco4", the share of "eco4 (subject to archetype)" expected to pass
        the archetype check and, when the HA has a CIGA list, "eco4 - passed ciga". These are converted to
        "confirmed" sales with the sale conversion rates.
      - jobs still waiting on a CIGA check (labels containing "subject to ciga", scaled down by the archetype
        rate when they also need an archetype check). These are converted to "forecast" sales by applying the
        CIGA pass rate and then the pass-to-sale rate.
    Whatever does not convert is reported as archetype failures, CIGA failures or expected cancellations. All
    counts are returned as plain ints.

    :param eligiblity_counts: DataFrame with an "ECO Eligibility" label column and a "count" column
    :param input_data: dict for the HA; only "ciga_list" is read, to tell whether a CIGA check has been done
    :param ha_ciga_conversion_rate: share of CIGA dependent jobs expected to pass the CIGA check
    :param ha_ciga_pass_to_sale_rate: share of CIGA passes expected to go on to a sale
    :param ha_eco4_to_sale_rate: share of CIGA independent ECO4 jobs expected to go on to a sale
    :param eco4_rate: value of a single ECO4 job
    :param archetype_conversion_rate: share of archetype dependent jobs expected to pass the archetype check
    :return: dict of counts and values keyed by report column name
    """
    labels = eligiblity_counts["ECO Eligibility"]
    counts = eligiblity_counts["count"]

    def count_for(label):
        # Total count of the rows carrying exactly this eligibility label
        return counts[labels == label].sum()

    subject_to_ciga = labels.str.contains("subject to ciga", regex=False)
    subject_to_archetype = labels.str.contains("subject to archetype", regex=False)

    # CIGA dependent jobs. Those that also need an archetype check are scaled down by the
    # archetype_conversion_rate before they join the pool that still needs a CIGA check
    ciga_and_archetype = counts[subject_to_ciga & subject_to_archetype].sum()
    ciga_and_archetype_passed = np.round(ciga_and_archetype * archetype_conversion_rate)
    remaining_needing_ciga_check = counts[subject_to_ciga & ~subject_to_archetype].sum() + ciga_and_archetype_passed

    # CIGA independent jobs, again scaling down the ones that need an archetype check
    archetype_only = count_for("eco4 (subject to archetype)")
    archetype_only_passed = np.round(archetype_only * archetype_conversion_rate)
    eco4_no_ciga_needed = count_for("eco4") + archetype_only_passed

    failed_archetype_check = int(
        ciga_and_archetype + archetype_only - ciga_and_archetype_passed - archetype_only_passed
    )

    # Confirmed pool: jobs that no longer depend on a CIGA outcome
    has_ciga_check = not input_data["ciga_list"].empty
    expected_confirmed_sales = eco4_no_ciga_needed * ha_eco4_to_sale_rate
    if has_ciga_check:
        eco4_ciga_passed = count_for("eco4 - passed ciga")
        eco4_confirmed_ciga_failures = int(count_for("failed ciga"))
        expected_confirmed_sales += eco4_ciga_passed * ha_ciga_pass_to_sale_rate
    else:
        # Without a CIGA list there are no known CIGA outcomes to count
        eco4_ciga_passed = 0
        eco4_confirmed_ciga_failures = 0
    eco4_confirmed = int(np.round(expected_confirmed_sales))
    confirmed_pool_cancellations = int(eco4_no_ciga_needed + eco4_ciga_passed - eco4_confirmed)

    # Forecast pool: jobs still waiting on a CIGA check
    if remaining_needing_ciga_check > 0:
        eco4_ciga_expected_remaining_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate)
        eco4_remaining_forecast = int(np.round(eco4_ciga_expected_remaining_to_pass * ha_ciga_pass_to_sale_rate))
        eco4_estimated_ciga_failures = int(remaining_needing_ciga_check - eco4_ciga_expected_remaining_to_pass)
        forecast_pool_cancellations = int(eco4_ciga_expected_remaining_to_pass - eco4_remaining_forecast)
    else:
        eco4_remaining_forecast = 0
        eco4_estimated_ciga_failures = 0
        forecast_pool_cancellations = 0

    eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast
    eco4_expected_cancellations = confirmed_pool_cancellations + forecast_pool_cancellations
    total_ciga_failures = eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures

    results = {
        # Counts
        "ECO4 - post CIGA - #": eco4_post_ciga,
        "Of which confirmed - #": eco4_confirmed,
        "Of which forecast - #": eco4_remaining_forecast,
        # Revenue
        "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate,
        "Of which confirmed - £": eco4_confirmed * eco4_rate,
        "Of which forecast - £": eco4_remaining_forecast * eco4_rate,
        # Archetype check failures
        "Estimated total - failed archetype check - #": failed_archetype_check,
        "Estimated total - failed archetype check - £": failed_archetype_check * eco4_rate,
        # Ciga failures
        "Estimated total - failed CIGA": total_ciga_failures,
        "Confirmed CIGA failures": eco4_confirmed_ciga_failures,
        "Estimated CIGA failures": eco4_estimated_ciga_failures,
        # Ciga failures cost
        "Estimated total - failed CIGA - £": int(total_ciga_failures * eco4_rate),
        "Confirmed CIGA failures - £": int(eco4_confirmed_ciga_failures * eco4_rate),
        "Estimated CIGA failures - £": int(eco4_estimated_ciga_failures * eco4_rate),
        # Expected cancellations
        "Expected cancellations - #": eco4_expected_cancellations,
        "Expected cancellations - £": eco4_expected_cancellations * eco4_rate
    }
    return results
def forecast_remaining_sales(loader):
# Assumptions:
# We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate
# and I don't want the numbers to change too much, depenent on the CIGA conversation rate
maximum_ciga_conversion = 0.75
# This is a hard limit to the allowed conversion rates to final sale. These are typically very
# high but there are some anomalies, amongst surveys that are early on
sales_conversion_lower_bound = 0.8
gbis_rate = 600
eco4_rate = 1710
# Based on ONS https://www.ons.gov.uk/peoplepopulationandcommunity/housing/bulletins/housingenglandandwales
# /census2021
# there are 5.7 million terraced properties in the UK, of the 19.3 million houses or bungalows. We therefore apply
# a 30% discount to homes that are dependent on an archetype check, since around 30% of them will be mid terraced
# This 30% is slightly harsh but we be conservative
# Therefore, the archetype check conversion rate is 70%
archetype_conversion_rate = 0.7
# 1) Calculate the conversion rate from passed CIGA to actual sale
converted_ciga_jobs = []
for ha_name, input_data in loader.data.items():
asset_list = input_data["asset_list"].copy()
survey_list = input_data["survey_list"].copy()
if survey_list.empty:
continue
ciga_dependent_assets = asset_list[
asset_list["ECO Eligibility"] == "eco4 - passed ciga"
]
# These are now the ciga dependent assets at installation
ciga_dependent_assets_at_installation = ciga_dependent_assets.merge(
survey_list[["asset_list_row_id", "installation_status"]],
how="inner",
on="asset_list_row_id"
)
# We then calculate how many get cancelled
ciga_dependent_assets_sold = ciga_dependent_assets_at_installation[
ciga_dependent_assets_at_installation["installation_status"].isin(
[
"ECO4 - installed", "ECO4 - in progress"
]
)
]
ciga_dependent_assets_failed = ciga_dependent_assets_at_installation[
~ciga_dependent_assets_at_installation["installation_status"].isin(
[
"ECO4 - installed", "ECO4 - in progress"
]
)
]
converted_ciga_jobs.append(
{
"HA Name": ha_name,
"# Ciga dependent at installation": ciga_dependent_assets_at_installation.shape[0],
"# Ciga dependent successfully installed": ciga_dependent_assets_sold.shape[0],
"# Ciga dependent failed install": ciga_dependent_assets_failed.shape[0]
}
)
converted_ciga_jobs = pd.DataFrame(converted_ciga_jobs)
# We calculate a ciga pass to install conversaion rate
median_ciga_pass_to_install = (
converted_ciga_jobs["# Ciga dependent successfully installed"].sum() /
converted_ciga_jobs["# Ciga dependent at installation"].sum()
)
# 2) Calculate the conversion rate from CIGA dependent to ciga passed
ciga_passrates = []
for ha_name, input_data in loader.data.items():
# If we don't have a ciga list, we can't do anything
if input_data["ciga_list"].empty:
continue
# 1) Calculate the conversion rate for CIGA to actual sale
asset_list = input_data["asset_list"].copy()
ciga_completed_assets = asset_list[
asset_list["ECO Eligibility"].isin(
[
"eco4 - passed ciga",
"failed ciga"
]
)
]
ciga_passed = ciga_completed_assets[
ciga_completed_assets["ECO Eligibility"].isin(
[
"eco4 - passed ciga"
]
)
]
ciga_passrates.append(
{
"Ha Name": ha_name,
"# CIGA dependent": ciga_completed_assets.shape[0],
"# CIGA passed": ciga_passed.shape[0],
}
)
ciga_passrates = pd.DataFrame(ciga_passrates)
median_ciga_success_rate = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum()
# 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install
eco4_ciga_independent_to_install = []
gbis_to_install = []
for ha_name, input_data in loader.data.items():
asset_list = input_data["asset_list"].copy()
survey_list = input_data["survey_list"].copy()
if survey_list.empty:
continue
# For properties that were identified as a typical ECO4 job, we calculate the number of properties that
# installed
# vs cancelled
typical_eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"]
typical_gbis = asset_list[asset_list["ECO Eligibility"] == "gbis"]
# Merge on the surveys
typical_eco4_installed = typical_eco4.merge(
survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
)
if not typical_eco4_installed.empty:
typical_eco4_sold = typical_eco4_installed[
typical_eco4_installed["installation_status"].isin(
[
"ECO4 - installed", "ECO4 - in progress"
]
)
]
eco4_ciga_independent_to_install.append(
{
"Ha Name": ha_name,
"# ECO4 at install stage": typical_eco4_installed.shape[0],
"# ECO4 successfully installed": typical_eco4_sold.shape[0]
}
)
typical_gbis_installed = typical_gbis.merge(
survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
)
if not typical_gbis_installed.empty:
typical_gbis_sold = typical_gbis_installed[
typical_gbis_installed["installation_status"].isin(
[
"GBIS - in progress", "GBIS - installed"
]
)
]
gbis_to_install.append(
{
"Ha Name": ha_name,
"# GBIS at install stage": typical_gbis_installed.shape[0],
"# GBIS successfully installed": typical_gbis_sold.shape[0]
}
)
eco4_ciga_independent_to_install = pd.DataFrame(eco4_ciga_independent_to_install)
gbis_to_install = pd.DataFrame(gbis_to_install)
eco4_ciga_independent_to_install["conversion"] = (
eco4_ciga_independent_to_install["# ECO4 successfully installed"] /
eco4_ciga_independent_to_install["# ECO4 at install stage"]
)
eco4_ciga_independent_to_install_clipped = eco4_ciga_independent_to_install[
eco4_ciga_independent_to_install["conversion"] >= sales_conversion_lower_bound
]
gbis_to_install["conversion"] = (
gbis_to_install["# GBIS successfully installed"] /
gbis_to_install["# GBIS at install stage"]
)
gbis_to_install_clipped = gbis_to_install[
gbis_to_install["conversion"] >= sales_conversion_lower_bound
]
median_eco4_to_install = (
eco4_ciga_independent_to_install_clipped["# ECO4 successfully installed"].sum() /
eco4_ciga_independent_to_install_clipped["# ECO4 at install stage"].sum()
)
median_gbis_to_install = (
gbis_to_install_clipped["# GBIS successfully installed"].sum() /
gbis_to_install_clipped["# GBIS at install stage"].sum()
)
# Produce the final output
december_figures = loader.december_figures.copy()
december_figures = december_figures.fillna(0)
# If we have negative remaining, it means that actually sold more gbis than they initially thought so we set
# remaining to 0
december_figures["ECO4 remaining"] = np.where(
december_figures["ECO4 remaining"] < 0, 0, december_figures["ECO4 remaining"]
)
december_figures["GBIS remaining"] = np.where(
december_figures["GBIS remaining"] < 0, 0, december_figures["GBIS remaining"]
)
results = []
for ha_name, input_data in loader.data.items():
# Original warmfront figures - ECO4
original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name]
if original_warmfront_estimates.empty:
# Append an empty row
original_warmfront_estimates = december_figures.head(1).copy()
for k in original_warmfront_estimates.columns:
original_warmfront_estimates[k] = 0
original_warmfront_estimates["HA Name"] = ha_name
original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0]
original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0]
original_warmfront_sold_eco4 = (
original_warmfront_estimates["No. of Tech surveys complete - Eco 4"].values[0] * eco4_rate
)
original_warmfront_eco4_revenue = original_warmfront_eco4 * eco4_rate
original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate
original_warmfront_sold_gbis = (
original_warmfront_estimates["No. of Tech surveys complete - GBIS"].values[0] * gbis_rate
)
# Original warmfront figures - GBIS
original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0]
original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0]
original_warmfront_gbis_revenue = (
original_warmfront_gbis * gbis_rate
)
original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate
# Asset list - ECO4
asset_list = input_data["asset_list"].copy()
survey_list = input_data["survey_list"].copy()
if survey_list.empty:
asset_list_remaining = asset_list.copy()
else:
# For HA6, there are a small number of postcodes that do not match to any item in the asset list
survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
asset_list_remaining = asset_list.merge(
survey_list[["asset_list_row_id", "installation_status"]],
how="left",
on="asset_list_row_id"
)
# Anything that has an installation has gone to installation, and therefore is not remaining
asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])]
asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"])
eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index()
eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index()
eco4_pre_ciga = eligiblity_counts[
eligiblity_counts["ECO Eligibility"].isin(
[
"eco4",
"eco4 (subject to ciga)",
"eco4 - passed ciga",
"failed ciga",
"eco4 (subject to ciga) (subject to archetype)",
"eco4 (subject to archetype)"
]
)
]["count"].sum()
eco4_pre_ciga_remaining = eligiblity_counts_remaining[
eligiblity_counts_remaining["ECO Eligibility"].isin(
[
"eco4",
"eco4 (subject to ciga)",
"eco4 - passed ciga",
"failed ciga",
"eco4 (subject to ciga) (subject to archetype)",
"eco4 (subject to archetype)"
]
)
]["count"].sum()
eco4_pre_ciga_revenue = eco4_pre_ciga * eco4_rate
eco4_pre_ciga_remaining_revenue = eco4_pre_ciga_remaining * eco4_rate
# Total Eligible - this is what passed ciga checks + strict. If we don't have what passed CIGA, we estimate
# We check if the HA has done a CIGA check. Also, if we have assets dormant at CIGA, we estimate what will
# convert
# We estimate a conversion for anything left post CIGA
ha_ciga_conversion = ciga_passrates[ciga_passrates["Ha Name"] == ha_name]
if not ha_ciga_conversion.empty:
ha_ciga_conversion_rate = (
ha_ciga_conversion["# CIGA passed"].values[0] / ha_ciga_conversion["# CIGA dependent"].values[0]
)
else:
ha_ciga_conversion_rate = (
median_ciga_success_rate if median_ciga_success_rate <= maximum_ciga_conversion else
maximum_ciga_conversion
)
# We also need the ha ciga passed to install success rate
ha_ciga_pass_to_sale = converted_ciga_jobs[converted_ciga_jobs["HA Name"] == ha_name]
if not ha_ciga_pass_to_sale.empty and ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] != 0:
ha_ciga_pass_to_sale_rate = (
ha_ciga_pass_to_sale["# Ciga dependent successfully installed"].values[0] /
ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0]
)
else:
ha_ciga_pass_to_sale_rate = median_ciga_pass_to_install
ha_eco4_to_sale = eco4_ciga_independent_to_install_clipped[
eco4_ciga_independent_to_install_clipped["Ha Name"] == ha_name
]
if not ha_eco4_to_sale.empty:
ha_eco4_to_sale_rate = (
ha_eco4_to_sale['# ECO4 successfully installed'].values[0] /
ha_eco4_to_sale['# ECO4 at install stage'].values[0]
)
else:
ha_eco4_to_sale_rate = median_eco4_to_install
eco4_post_ciga_total_results = calculate_eco4_post_ciga(
eligiblity_counts=eligiblity_counts,
input_data=input_data,
ha_ciga_conversion_rate=ha_ciga_conversion_rate,
ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate,
ha_eco4_to_sale_rate=ha_eco4_to_sale_rate,
eco4_rate=eco4_rate,
archetype_conversion_rate=archetype_conversion_rate
)
eco4_post_ciga_remaining_results = calculate_eco4_post_ciga(
eligiblity_counts=eligiblity_counts_remaining,
input_data=input_data,
ha_ciga_conversion_rate=ha_ciga_conversion_rate,
ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate,
ha_eco4_to_sale_rate=ha_eco4_to_sale_rate,
eco4_rate=eco4_rate,
archetype_conversion_rate=archetype_conversion_rate
)
# Calculate the delta compared to Warmfront's original remaining
if original_warmfront_remaining_eco4 == 0:
eco4_delta_vs_original_estimate_remaining = "N/A"
else:
eco4_delta_vs_original_estimate_remaining = ((eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] -
original_warmfront_remaining_eco4) /
original_warmfront_remaining_eco4)
# GBIS Figures
# Estimate the GBIS conversion rate
ha_gbis_sale_conversion = gbis_to_install_clipped[
gbis_to_install_clipped["Ha Name"] == ha_name
]
if not ha_gbis_sale_conversion.empty:
ha_gbis_sale_conversion = (
ha_gbis_sale_conversion["# GBIS successfully installed"].values[0] /
ha_gbis_sale_conversion["# GBIS at install stage"].values[0]
)
else:
ha_gbis_sale_conversion = median_gbis_to_install
gbis_total_pre_cancellations = eligiblity_counts[
eligiblity_counts["ECO Eligibility"] == "gbis"
]["count"].sum()
gbis_total_pre_cancellations_revenue = gbis_total_pre_cancellations * gbis_rate
# gbis_total = int(np.round(gbis_total_pre_cancellations * ha_gbis_sale_conversion))
# gbis_total_revenue = int(gbis_total * gbis_rate)
gbis_remaining_pre_cancellations = eligiblity_counts_remaining[
eligiblity_counts_remaining["ECO Eligibility"] == "gbis"
]["count"].sum()
gbis_remaining_pre_cancellations_revenue = (
gbis_remaining_pre_cancellations * gbis_rate
)
# This is the gbis jobs we expect to sell
gbis_remaining = int(np.round(gbis_remaining_pre_cancellations * ha_gbis_sale_conversion))
gbis_remaining_revenue = int(gbis_remaining * gbis_rate)
# This is the number we expect to cancel
gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining)
gbis_remaining_expected_cancellations_revenue = gbis_remaining_expected_cancellations * gbis_rate
# GBIS delta
if original_warmfront_remaining_gbis == 0:
gbis_delta_vs_original_estimate_remaining = "N/A"
else:
gbis_delta_vs_original_estimate_remaining = (
(gbis_remaining - original_warmfront_remaining_gbis) / original_warmfront_remaining_gbis
)
# Current sales figures
# For any sales surveys that are complete, that could still cancel, we apply a conversion rate
eco4_actually_sold = 0
eco4_confirmed_cancellations = 0
eco4_expected_cancellations = 0
gbis_actually_sold = 0
gbis_confirmed_cancellations = 0
gbis_expected_cancellations = 0
if not survey_list.empty:
surveys_with_eligibility = survey_list.merge(
asset_list[["asset_list_row_id", "ECO Eligibility"]],
how="left", on="asset_list_row_id"
)
completed_eco4_sales = surveys_with_eligibility[
surveys_with_eligibility["installation_status"] == "ECO4 - installed"
].shape[0]
incomplete_eco4_sales = surveys_with_eligibility[
(surveys_with_eligibility["installation_status"] == "ECO4 - in progress") &
(~surveys_with_eligibility["ECO Eligibility"].isin(
["eco4 - passed ciga"])
)
].shape[0]
incomplete_eco4_sales_ciga = surveys_with_eligibility[
(surveys_with_eligibility["installation_status"] == "ECO4 - in progress") &
(surveys_with_eligibility["ECO Eligibility"].isin(
["eco4 - passed ciga"])
)
].shape[0]
eco4_confirmed_cancellations = surveys_with_eligibility[
surveys_with_eligibility["installation_status"] == "ECO4 - cancelled"
].shape[0]
expected_eco4_sales_no_ciga = np.round(incomplete_eco4_sales * ha_eco4_to_sale_rate)
expected_eco4_sales_ciga = np.round(incomplete_eco4_sales_ciga * ha_ciga_pass_to_sale_rate)
eco4_expected_cancellations = (incomplete_eco4_sales + incomplete_eco4_sales_ciga) - (
expected_eco4_sales_no_ciga + expected_eco4_sales_ciga
)
eco4_expected_cancellations = int(np.round(eco4_expected_cancellations))
eco4_actually_sold = eco4_rate * (
completed_eco4_sales + expected_eco4_sales_no_ciga + expected_eco4_sales_ciga
)
completed_gbis_sales = surveys_with_eligibility[
surveys_with_eligibility["installation_status"] == "GBIS - installed"
].shape[0]
incomplete_gbis_sales = surveys_with_eligibility[
(surveys_with_eligibility["installation_status"] == "GBIS - in progress")
].shape[0]
# Get confirmed cancellations
gbis_confirmed_cancellations = surveys_with_eligibility[
surveys_with_eligibility["installation_status"] == "GBIS - cancelled"
].shape[0]
expected_gbis_unconfirmed_sales = np.round(incomplete_gbis_sales * ha_gbis_sale_conversion)
gbis_expected_cancellations = int(incomplete_gbis_sales - expected_gbis_unconfirmed_sales)
gbis_actually_sold = completed_gbis_sales * gbis_rate + (
expected_gbis_unconfirmed_sales * gbis_rate
)
# Add in the variance:
# We should expect that the pre-ciga total is:
# 1) The number of post CIGA successes +
# 2) The number of archetype failures +
# 2) the number of CIGA failures +
# 3) The number of cancellations
variance_total = eco4_pre_ciga - (
eco4_post_ciga_total_results["ECO4 - post CIGA - #"] +
eco4_post_ciga_total_results["Estimated total - failed archetype check - #"] +
eco4_post_ciga_total_results['Estimated total - failed CIGA'] +
eco4_post_ciga_total_results["Expected cancellations - #"]
)
if variance_total != 0:
raise ValueError("Something went wrong in variance total")
variance_remaining = eco4_pre_ciga_remaining - (
eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] +
eco4_post_ciga_remaining_results["Estimated total - failed archetype check - #"] +
eco4_post_ciga_remaining_results['Estimated total - failed CIGA'] +
eco4_post_ciga_remaining_results["Expected cancellations - #"]
)
if variance_remaining != 0:
raise ValueError("Something went wrong in variance remaining")
# We also check variances to make sure that the pre-CIGA ECO4 total equals
# 1) Pre CIGA remaining +
# 2) ECO4 sold +
# 3) ECO4 confirmed cancellations +
# 4) ECO4 unconfirmed cancellations
pre_ciga_eco4_variance = (
eco4_pre_ciga_revenue -
eco4_pre_ciga_remaining_revenue -
eco4_actually_sold -
eco4_confirmed_cancellations * eco4_rate -
eco4_expected_cancellations * eco4_rate
)
if pre_ciga_eco4_variance != 0:
raise ValueError("Something went wrong in pre_ciga_eco4_variance")
# Check GBIS total variance
# The total before cancellations should equal:
# The number of sold +
# The number of confirmed cancelled +
# The number of expected cancelled +
# The number of remaining
gbis_variance = gbis_total_pre_cancellations - (
gbis_actually_sold / gbis_rate +
gbis_confirmed_cancellations +
gbis_expected_cancellations +
gbis_remaining_pre_cancellations
)
if gbis_variance != 0:
raise ValueError("Something went wrong in gbis_variance")
# We expect the remaining to equal expected sales + expected cancellations
gbis_variance_2 = gbis_remaining_pre_cancellations - (
gbis_remaining +
gbis_remaining_expected_cancellations
)
if gbis_variance_2 != 0:
raise ValueError("Something went wrong in gbis_variance2")
# Update the GBIS sold, since Warmfront often sold more GBIS that expected
original_warmfront_gbis_revenue = original_warmfront_sold_gbis + original_warmfront_remaining_gbis_revenue
original_warmfront_gbis = (
original_warmfront_sold_gbis / gbis_rate + original_warmfront_remaining_gbis_revenue / gbis_rate
)
to_append = {
("", "", "", "HA Name"): ha_name,
# ECO4 - original warmfront figures
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4,
("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4,
("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue,
("ECO4 original", "", "Sold or cancelled - £", ""): original_warmfront_sold_eco4,
("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue,
# GBIS - original warmfront figures
("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis,
("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis,
("GBIS original", "", "Total - £", ""): original_warmfront_gbis_revenue,
("GBIS original", "", "Sold or cancelled - £", ""): original_warmfront_sold_gbis,
("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue,
# ECO4 - asset list, pre-ciga
("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga,
("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining,
("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue,
("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue,
("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL", ""): pre_ciga_eco4_variance,
("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL VS ELIGIBLE & INELIGIBLE", ""): variance_total,
("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 REMAINING VS ELIGIBLE & INELIGIBLE", ""):
variance_remaining,
("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold,
("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations * eco4_rate,
# This is for jobs that are in-progress and could still cancel
("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations * eco4_rate,
# ECO4 - asset list, post ciga, total
("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"):
eco4_post_ciga_total_results[
"ECO4 - post CIGA - #"],
("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[
"ECO4 - post CIGA - £"],
# ECO4 - asset list, post ciga, remaining
("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[
"ECO4 - post CIGA - #"],
("ECO4 post-ciga", "", "Estimated remaining eligible - £", ""): eco4_post_ciga_remaining_results[
"ECO4 - post CIGA - £"],
("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %",
""): eco4_delta_vs_original_estimate_remaining,
("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""):
eco4_post_ciga_remaining_results["Of which confirmed - #"],
("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - £", ""):
eco4_post_ciga_remaining_results["Of which confirmed - £"],
("ECO4 post-ciga", "", "Of which forecast - #", ""):
eco4_post_ciga_remaining_results["Of which forecast - #"],
("ECO4 post-ciga", "", "Of which forecast - £", ""):
eco4_post_ciga_remaining_results["Of which forecast - £"],
# Expected ECO4 cancellations
("ECO4 Cancellations", "", "Of which expected cancellations - #", ""): eco4_post_ciga_remaining_results[
"Expected cancellations - #"
],
("ECO4 Cancellations", "", "Of which expected cancellations - £", ""): eco4_post_ciga_remaining_results[
"Expected cancellations - £"
],
# Archetype check failures
("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - #", ""):
eco4_post_ciga_remaining_results['Estimated total - failed archetype check - #'],
("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - £", ""):
eco4_post_ciga_remaining_results['Estimated total - failed archetype check - £'],
# CIGA failures
("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[
'Estimated total - failed CIGA'
],
("ECO4 CIGA failures", "", "Estimated total - failed CIGA - £", ""): eco4_post_ciga_remaining_results[
'Estimated total - failed CIGA - £'
],
("ECO4 CIGA failures", "", "Confirmed failures - #", ""): eco4_post_ciga_remaining_results[
"Confirmed CIGA failures"
],
("ECO4 CIGA failures", "", "Confirmed failures - £", ""): eco4_post_ciga_remaining_results[
"Confirmed CIGA failures - £"
],
("ECO4 CIGA failures", "", "Estimated failures - #", ""): eco4_post_ciga_remaining_results[
"Estimated CIGA failures"
],
("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[
"Estimated CIGA failures - £"
],
# GBIS postcode list
("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total_pre_cancellations,
("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"):
gbis_total_pre_cancellations_revenue,
("GBIS Postcode list", "Warmfront post code list", "GBIS VARIANCE", "GBIS total"): gbis_variance,
("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold,
("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate,
# This is for jobs that are in-progress and could still cancel
("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations * gbis_rate,
("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"):
gbis_remaining_pre_cancellations,
("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"):
gbis_remaining_pre_cancellations_revenue,
("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""):
gbis_delta_vs_original_estimate_remaining,
# Expected cancellations
(
"GBIS Postcode list", "", "Of which expected sales - £ - £",
"GBIS total"): gbis_remaining_revenue,
("GBIS Postcode list", "", "Of which expected cancellations -£", "GBIS total"):
gbis_remaining_expected_cancellations_revenue
}
# Make sure nothing is forgotten due to duplicate multi-index keys
if len(to_append) != 51:
raise ValueError("Something went wrong")
results.append(to_append)
results = pd.DataFrame(results)
results.to_csv("pipeline_remaining_raw.csv")
totals_row = {}
for col in results.columns:
if col == ('', '', '', 'HA Name'):
totals_row[col] = "Total"
elif col in [
("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", ""),
("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")
]:
totals_row[col] = None
else:
totals_row[col] = results[col].sum()
# For the delta columns, we calculate the delta on the totals
totals_row[("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", "")] = (
(
totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] -
totals_row[("ECO4 original", "", "Remaining - #", "")]
) / totals_row[("ECO4 original", "", "Remaining - #", "")]
)
totals_row[("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")] = (
(
totals_row[("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")] -
totals_row[("GBIS original", "", "Remaining - #", "")]
) / totals_row[("GBIS original", "", "Remaining - #", "")]
)
blank_row = pd.DataFrame([{col: "" for col in results.columns}])
# Put together a Warmfront original remaining ECO4 vs asset list remaining ECO4 and same for GBIS, as well as totals
# ECO4 Headlines
headline_eco4_original_remaining = totals_row[("ECO4 original", "", "Remaining - #", "")]
headline_eco4_original_remaining_revenue = totals_row[("ECO4 original", "", "Remaining - £", "")]
headline_eco4_postcode_list_remaining = totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")]
headline_eco4_postcode_list_remaining_revenue = totals_row[
("ECO4 post-ciga", "", "Estimated remaining eligible - £", "")
]
headline_eco4_delta = 100 * (
(headline_eco4_postcode_list_remaining - headline_eco4_original_remaining) /
headline_eco4_original_remaining
)
headline_eco4_delta = round(headline_eco4_delta, 1)
# GBIS Headlines
headline_gbis_original_remaining = totals_row[("GBIS original", "", "Remaining - #", "")]
headline_gbis_original_remaining_revenue = totals_row[("GBIS original", "", "Remaining - £", "")]
headline_gbis_postcode_list_remaining = totals_row[
("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")
]
headline_gbis_postcode_list_remaining_revenue = totals_row[
("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total")
]
headline_gbis_delta = 100 * (
(headline_gbis_postcode_list_remaining - headline_gbis_original_remaining) /
headline_gbis_original_remaining
)
headline_gbis_delta = round(headline_gbis_delta, 1)
headline_original_total_revenue_remaining = (
headline_eco4_original_remaining_revenue + headline_gbis_original_remaining_revenue
)
headline_postcode_list_total_revenue_remaining = (
headline_eco4_postcode_list_remaining_revenue + headline_gbis_postcode_list_remaining_revenue
)
headline_total_delta = 100 * (
(headline_postcode_list_total_revenue_remaining - headline_original_total_revenue_remaining) /
headline_original_total_revenue_remaining
)
headline_total_delta = round(headline_total_delta, 1)
headline_eco4_sold_since_november = (
totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] +
totals_row[('ECO4 pre-ciga', '', 'Confirmed cancellations - £', '')] + # confirmed canclleations
totals_row[('ECO4 pre-ciga', '', 'Unconfirmed cancellations - £', '')] - # expected cancellations
totals_row[('ECO4 original', '', 'Sold or cancelled - £', '')]
)
headline_gbis_sold_since_november = (
totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] +
totals_row[("GBIS Postcode list", "", "Confirmed cancellations - £", "")] + # confirmed cancellations
totals_row[("GBIS Postcode list", "", "Unconfirmed cancellations - £", "")] - # expected cancellations
totals_row[('GBIS original', '', 'Sold or cancelled - £', '')]
)
headlines = [
{
("", "", "", "HA Name"): "Headlines",
},
{
("", "", "", "HA Name"): "ECO4 Remaining - November - #",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_original_remaining
},
{
("", "", "", "HA Name"): "ECO4 Remaining - November - £",
(
"", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_eco4_original_remaining_revenue
},
{
("", "", "", "HA Name"): "ECO4 Sold or cancelled since November - £",
(
"", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_eco4_sold_since_november
},
{
("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - #",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining
},
{
("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - £",
("", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_eco4_postcode_list_remaining_revenue
},
{
("", "", "", "HA Name"): "ECO4 £ remaining delta - %",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_eco4_delta) + "%"
},
{
("", "", "", "HA Name"): "GBIS Remaining - November - #",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining
},
{
("", "", "", "HA Name"): "GBIS Remaining - November - £",
(
"", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_gbis_original_remaining_revenue
},
{
("", "", "", "HA Name"): "GBIS Sold or cancelled since November - £",
(
"", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_gbis_sold_since_november
},
{
("", "", "", "HA Name"): "GBIS Remaining - post code list - #",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining
},
{
("", "", "", "HA Name"): "GBIS Remaining - post code list - £",
("", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_gbis_postcode_list_remaining_revenue
},
{
("", "", "", "HA Name"): "GBIS delta %",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_gbis_delta) + "%"
},
# Total revenue
{
("", "", "", "HA Name"): "Total Remaining - November - £",
("", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_original_total_revenue_remaining
},
{
("", "", "", "HA Name"): "Total Remaining - post code list (post CIGA) - £",
("", "Original Warmfront estimate", "Total - #",
"ECO4 - November"): headline_postcode_list_total_revenue_remaining
},
{
("", "", "", "HA Name"): "Total Remaining delta %",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_total_delta) + "%"
},
]
assumptions = [
{
("", "", "", "HA Name"): "Assumptions",
},
{
("", "", "", "HA Name"): "ECO4 rate",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(eco4_rate)
},
{
("", "", "", "HA Name"): "GBIS rate",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(gbis_rate)
},
{
("", "", "", "HA Name"): "Median CIGA pass rate",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
round(median_ciga_success_rate * 100, 1)) + "%",
},
{
("", "", "", "HA Name"): "Maximum allowed CIGA pass rate",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
round(maximum_ciga_conversion * 100, 1)) + "%",
("ECO4 original", "", "Remaining - #",
""): "- Maximum allowed CIGA conversion for HAs without CIGA checks We do not allow above this to be "
"conservative"
},
{
("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
round(median_eco4_to_install * 100, 1)) + "%",
("ECO4 original", "", "Remaining - #",
""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Surveys that resulted "
"in cancelled install are excluded."
},
{
("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate",
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
round(median_ciga_pass_to_install * 100, 1)) + "%",
("ECO4 original", "", "Remaining - #",
""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Surveys that resulted in "
"cancelled installs are excluded."
}
]
results = pd.concat(
[
results,
pd.DataFrame([totals_row]),
blank_row,
pd.DataFrame(headlines),
blank_row,
blank_row,
pd.DataFrame(assumptions)
]
)
with open("HA Remaining Analysis.csv", "w", newline="") as file:
# Write the DataFrame data without the index (adjust if you want the index).
results.to_csv(file, header=True, index=False)
def fml_data_pull(loader):
    """
    Pull the newest EPC for every eligible property of each HA and store the results in S3.

    For each housing association in the list below, every asset list row whose "ECO Eligibility" is not
    "not eligible" is looked up through SearchEpc (with the Ordnance Survey step skipped). Rows for which no EPC
    is found are left out. The EPCs found for a HA are saved as a pickled DataFrame under
    ``ha-analysis/revised/{ha}/epc_data.pickle`` in the ``retrofit-datalake-dev`` bucket.

    A failure while processing one HA does not stop the remaining HAs: the error is logged with its traceback and
    the names of all failed HAs are logged once the loop has finished.

    :param loader: object whose ``data[ha]["asset_list"]`` is the asset list DataFrame of that HA
    :return: None
    """
    has_to_pull = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13",
        "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12",
        "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49",
        'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20",
    ]
    # Can't pull from EPC database because it's based in Scotland
    # "HAXXX", "HAXX"

    from backend.SearchEpc import SearchEpc

    # NOTE(review): this is a credential committed to source control. It should be rotated and read from the
    # environment (the module already defines EPC_AUTH_TOKEN) - confirm the env value is usable before swapping.
    epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA="

    failed_has = []
    for ha in has_to_pull:
        print(f"Pulling data for {ha}")
        try:
            asset_list = loader.data[ha]["asset_list"].copy()
            # Properties found as eligible
            eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"]

            # For each property, search for the latest EPC
            epc_data = []
            for _, row in tqdm(eligible.iterrows(), total=eligible.shape[0]):
                property_type, _built_form = get_property_type_and_built_form(property_meta=row, ha_name=ha)

                # "HAXXX" is not in the list above, so this branch is currently not exercised
                if ha == "HAXXX":
                    address_parts = [
                        row["Door Number"], row["Address Line 1"], row["Address Line 2"], row["Address Line 3"],
                        row["Postcode"]
                    ]
                    full_address = ", ".join(str(part) for part in address_parts if part is not None)
                else:
                    full_address = row["matching_address"]

                searcher = SearchEpc(
                    address1=str(row["HouseNo"]),
                    postcode=row["matching_postcode"],
                    auth_token=epc_api_key,
                    os_api_key="",
                    property_type=property_type,
                    full_address=full_address,
                    fast=True
                )
                # Force the skipping of estimating the EPC
                searcher.ordnance_survey_client.property_type = None
                searcher.ordnance_survey_client.built_form = None
                searcher.find_property(skip_os=True)

                if searcher.newest_epc is None:
                    continue

                epc_data.append(
                    {
                        "asset_list_row_id": row["asset_list_row_id"],
                        **searcher.newest_epc.copy()
                    }
                )

            # Save the data in S3 as a pickled DataFrame
            save_pickle_to_s3(
                data=pd.DataFrame(epc_data),
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle"
            )
        except Exception:
            # Best effort per HA: keep going, but leave a trace of what failed and why
            logger.exception(f"Failed to pull EPC data for {ha}")
            failed_has.append(ha)

    if failed_has:
        logger.warning(f"EPC data pull failed for the following HAs: {failed_has}")
def extract_lower_bound(age_band):
    """
    Read the starting year out of an age band written as "<prefix>: <start>-<end>".

    The text between the first ':' and the next '-' is parsed as an integer. Missing values, bands without a ':'
    and bands where that text is not an integer (for example "<prefix>: 2007 onwards") all give 1930.

    :param age_band: age band text, or a missing value
    :return: the starting year as an int, or 1930 when it cannot be read
    """
    fallback_year = 1930
    if pd.isna(age_band):
        return fallback_year

    pieces = age_band.split(':')
    if len(pieces) < 2:
        # No ':' separator, so there is nothing after the prefix to parse
        return fallback_year

    start_text = pieces[1].split('-')[0].strip()
    try:
        return int(start_text)
    except ValueError:
        return fallback_year
def classify_loft(x):
    """
    Grade a property row by its roof insulation thickness.

    :param x: row or mapping with "roof_insulation_thickness" (anything float() accepts) and "epc_age"
    :return: "high" when the thickness is at most 100; "medium" when it is at most 200, or at most 270 with an
        "epc_age" of at least 5 * 365; "unlikely" otherwise
    """
    thickness = float(x["roof_insulation_thickness"])

    # high confidence
    if thickness <= 100:
        return "high"

    # "epc_age" is only looked at for thicknesses above 200 and up to 270
    if thickness <= 200 or (thickness <= 270 and x["epc_age"] >= 5 * 365):
        return "medium"

    return "unlikely"
def _fml_roof_thickness_by_uprn(properties):
    """
    Parse the roof insulation thickness out of each property's EPC roof description.

    Rows with a missing roof description, or with the literal description "SAP05:Roof", are left out.

    :param properties: DataFrame with "uprn" and "roof-description" columns
    :return: DataFrame with "uprn" and "roof_insulation_thickness" (as text, with any "+" removed)
    """
    parsed = []
    for _, prop in properties.iterrows():
        description = prop["roof-description"]
        if pd.isnull(description) or description == "SAP05:Roof":
            continue
        thickness = RoofAttributes(description).process()["insulation_thickness"]
        # If there is a + in the thickness, strip it out
        parsed.append({"uprn": prop["uprn"], "roof_insulation_thickness": str(thickness).replace("+", "")})
    # Naming the columns keeps the later merge on "uprn" valid even when nothing could be parsed
    return pd.DataFrame(parsed, columns=["uprn", "roof_insulation_thickness"])


def _fml_resolve_archetype_checks(properties, ha_name):
    """
    Replace every "(subject to archetype)" eligibility label, in place, with its resolved label.

    A property passes the archetype check when its EPC says it is a House that is Semi-Detached, End-Terrace or
    Detached, or a Bungalow that is Detached. Outcomes:

    - "eco4 (subject to ciga) (subject to archetype)" -> "eco4 (subject to ciga)" on pass, "not eligible" otherwise
    - "eco4 (subject to archetype)" -> "eco4" on pass, "gbis" otherwise

    :param properties: DataFrame with "ECO Eligibility", "property-type" and "built-form" columns
    :param ha_name: HA name, only used in the error message
    :raises Exception: if any label still contains "subject to archetype" afterwards
    """
    outcomes = (
        ("eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to ciga)", "not eligible"),
        ("eco4 (subject to archetype)", "eco4", "gbis"),
    )
    if any(properties["ECO Eligibility"].str.contains("subject to archetype")):
        passes_archetype = (
            (
                (properties["property-type"] == "House") &
                (properties["built-form"].isin(["Semi-Detached", "End-Terrace", "Detached"]))
            ) | (
                (properties["property-type"] == "Bungalow") &
                (properties["built-form"].isin(["Detached"]))
            )
        )
        for pending_label, label_on_pass, label_on_fail in outcomes:
            is_pending = properties["ECO Eligibility"] == pending_label
            properties["ECO Eligibility"] = np.where(
                is_pending & passes_archetype,
                label_on_pass,
                np.where(is_pending, label_on_fail, properties["ECO Eligibility"])
            )

    if any(properties["ECO Eligibility"].str.contains("subject to archetype")):
        raise Exception(
            f"{ha_name}: 'subject to archetype' eligibility labels remain after the archetype check"
        )


def _fml_clean_roof_thickness(thickness):
    """
    Turn the textual roof insulation thickness values into text that float() can parse.

    Missing values become "0". The replacements run in order, so "below average" is handled before the bare
    "average", and "above average" ends up as "150" via the final "above 150" replacement.

    :param thickness: Series of thickness values as text (may contain missing values)
    :return: cleaned Series
    """
    replacements = (
        ("below average", "50"),
        ("None", "0"),
        ("none", "0"),
        ("average", "150"),
        ("above 150", "150"),
    )
    cleaned = thickness.fillna("0")
    for label, replacement in replacements:
        cleaned = cleaned.str.replace(label, replacement)
    return cleaned


def _fml_extrapolate(unsurveyed_count, surveyed_hits, surveyed_total):
    """
    Scale a number of properties without an EPC by the hit rate observed on properties with an EPC.

    :param unsurveyed_count: number of properties without an EPC
    :param surveyed_hits: number of properties with an EPC that met the criteria
    :param surveyed_total: number of properties with an EPC the hits were drawn from
    :return: 0 when unsurveyed_count is 0, otherwise unsurveyed_count * surveyed_hits / surveyed_total, rounded
    """
    if unsurveyed_count == 0:
        return 0
    # NOTE(review): surveyed_total == 0 is not guarded here and divides by zero; the fallback for "no surveyed
    # sample" is a modelling decision that needs confirming before one is added.
    return np.round(unsurveyed_count * (surveyed_hits / surveyed_total))


def fml_analysis(loader):
    """
    Estimate, per HA, how much of the remaining ECO4 / GBIS pipeline is supported by EPC data.

    For every HA in the list below the function:

    1. reads the EPC data previously stored in S3 and joins it to the eligible asset list rows,
    2. drops properties that already have an installation status in the survey list,
    3. resolves "(subject to archetype)" eligibility labels from the EPC property type and built form,
    4. classifies the loft from the EPC roof description,
    5. counts the properties with an EPC whose wall description, loft classification and energy efficiency
       (<= 80) meet the criteria of their eligibility label, applying the HA's CIGA pass rate where a CIGA check
       is still needed,
    6. scales the properties without an EPC by the rates observed on those with one.

    One row per HA is written to "analysis - revised - audit update.csv".

    :param loader: object exposing ``december_figures`` (DataFrame with "HA Name", "ECO4 remaining" and
        "GBIS remaining") and ``data[ha]`` with "asset_list", "survey_list" and "ciga_list" DataFrames
    :return: None
    :raises Exception: if a merge changes the number of rows, or an archetype label cannot be resolved
    """
    # CIGA pass rate used for HAs that have no CIGA list of their own
    assumed_ciga_pass_rate = 0.731

    ha_names = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13",
        "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12",
        "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49",
        'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20",
    ]

    # Wall descriptions accepted for properties that are not going through a CIGA check
    no_ciga_cavity_descriptions = [
        "Cavity wall, as built, insulated (assumed)",
        "Cavity wall, as built, no insulation (assumed)",
        "Cavity wall, as built, partial insulation (assumed)",
        "Cavity wall, no insulation (assumed)",
        "Cavity wall, partial insulation (assumed)",
        "Cavity wall,",
        "Cavity wall, insulated (assumed)",
    ]

    # TODO: There will be some properties that are subject to CIGA that do not look like they need a CIGA check!
    #  Pass them! Non-invasives will have checked the wall though

    results = []
    for ha_name in tqdm(ha_names):
        original_figures = loader.december_figures[loader.december_figures["HA Name"] == ha_name]
        original_remaining = original_figures["ECO4 remaining"].values[0]
        original_gbis_remaining = original_figures["GBIS remaining"].values[0]

        asset_list = loader.data[ha_name]["asset_list"].copy()
        # Properties found as eligible
        eligible_assets = asset_list[asset_list["ECO Eligibility"] != "not eligible"]

        # Read in the EPC data
        epc_data = read_pickle_from_s3(
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle"
        )
        # We make sure we don't have duplicates. We do a super basic drop duplicates because it shouldn't be a
        # huge issue at this point
        epc_data = epc_data.drop_duplicates("uprn")
        # Time from the inspection to now, in days
        epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days
        if "estimated" not in epc_data.columns:
            # For all after HA7, we don't use estimated surveys
            epc_data["estimated"] = False

        remaining = eligible_assets.merge(epc_data, how="left", on="asset_list_row_id")
        # Asset list rows without an EPC are treated as estimated
        remaining["estimated"] = remaining["estimated"].fillna(True)
        if remaining.shape[0] != eligible_assets.shape[0]:
            raise Exception(
                f"{ha_name}: merging the EPC data on asset_list_row_id changed the row count "
                f"({eligible_assets.shape[0]} -> {remaining.shape[0]})"
            )

        # Take just remaining
        survey_list = loader.data[ha_name]["survey_list"]
        if not survey_list.empty:
            matched_surveys = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
            remaining = remaining.merge(
                matched_surveys[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation status has gone to installation, and therefore is not remaining
            remaining = remaining[pd.isnull(remaining["installation_status"])]
            remaining = remaining.drop(columns=["installation_status"])

        rows_before_roof_merge = remaining.shape[0]
        remaining = remaining.merge(_fml_roof_thickness_by_uprn(remaining), how="left", on="uprn")
        if remaining.shape[0] != rows_before_roof_merge:
            raise Exception(
                f"{ha_name}: merging the roof insulation thickness on uprn changed the row count "
                f"({rows_before_roof_merge} -> {remaining.shape[0]})"
            )

        # Automated archetype check
        _fml_resolve_archetype_checks(remaining, ha_name)

        # Clean roof insulation and classify the loft
        remaining["roof_insulation_thickness"] = _fml_clean_roof_thickness(remaining["roof_insulation_thickness"])
        remaining["roof_classification"] = remaining.apply(classify_loft, axis=1)

        # "estimated" can be an object column after the fillna above, so compare explicitly instead of using ~
        surveyed = remaining[remaining["estimated"] == False]  # noqa: E712
        # NOTE(review): divides by zero when a HA has no remaining properties at all
        proportion_with_survey = 100 * surveyed.shape[0] / remaining.shape[0]

        # Let's look just at the ECO4 business
        # For things that had a survey, take the properties that didn't need a CIGA check
        no_ciga_check_needed = surveyed[surveyed["ECO Eligibility"] == "eco4"]
        no_ciga_check_needed_eligible = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (no_ciga_check_needed["roof_classification"].isin(["high", "medium"])) &
            (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80)
        ]
        # For anything not needing a CIGA check, some of it will be GBIS: same wall and rating criteria, but
        # not already counted as ECO4 above
        no_ciga_check_needed_eligible_gbis = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) &
            (~no_ciga_check_needed["asset_list_row_id"].isin(
                no_ciga_check_needed_eligible["asset_list_row_id"].values
            ))
        ]

        # !!!!!!!!!!!! AT RISK !!!!!!!!!!!!
        # Properties that passed their CIGA check, and properties that still need one, share the same detection
        # criteria: any cavity wall description, a high/medium loft and an energy efficiency of at most 80
        ciga_check_passed = surveyed[surveyed["ECO Eligibility"] == "eco4 - passed ciga"]
        ciga_check_passed_eligible = ciga_check_passed[
            (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) &  # noqa: E712
            (ciga_check_passed["roof_classification"].isin(["high", "medium"])) &
            (ciga_check_passed["current-energy-efficiency"].astype(float) <= 80)
        ]

        ciga_list = loader.data[ha_name]["ciga_list"]
        if not ciga_list.empty:
            # The share of CIGA rows with Guarantee == "No" is used as the pass rate. A list without any "No"
            # therefore gives a pass rate of 0.
            guarantee_shares = ciga_list["Guarantee"].value_counts(normalize=True)
            ha_ciga_pass_rate = guarantee_shares.get("No", 0.0)
        else:
            ha_ciga_pass_rate = assumed_ciga_pass_rate

        # We take just the cavity walls
        # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/
        # This paper is based on London properties
        # The proportion of EPCs with building characteristics errors are shown to
        # differ between variables; floor and wall type errors occur in ~10-15% of EPCs,
        # compared with ~5% for wall insulation and glazing performance
        ciga_check_needed = surveyed[surveyed["ECO Eligibility"].str.contains("subject to ciga")].copy()
        ciga_check_needed_eligible = ciga_check_needed[
            (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) &  # noqa: E712
            (ciga_check_needed["roof_classification"].isin(["high", "medium"])) &
            (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80)
        ]

        # Finally, characterise gbis properties. Some of the business might look like ECO4 work, whereas we then
        # qualify what actually looks like gbis
        gbis_identified = surveyed[surveyed["ECO Eligibility"] == "gbis"].copy()
        gbis_looks_like_eco4 = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (gbis_identified["roof_classification"].isin(["high", "medium"])) &
            (gbis_identified["current-energy-efficiency"].astype(float) <= 80) &
            (
                (
                    (gbis_identified["property-type"] == "House") &
                    (gbis_identified["built-form"] != "Mid-Terrace")
                ) | (
                    (gbis_identified["property-type"] == "Bungalow") &
                    (gbis_identified["built-form"].isin(["Detached"]))
                )
            )
        ]
        gbis_confirmed = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (gbis_identified["current-energy-efficiency"].astype(float) <= 80) &
            (~gbis_identified["asset_list_row_id"].isin(gbis_looks_like_eco4["asset_list_row_id"].values))
        ]

        # Expectations from the properties that have an EPC
        ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate)
        without_ciga_expectation = no_ciga_check_needed_eligible.shape[0]
        passed_ciga_expectation = ciga_check_passed_eligible.shape[0]
        identified_as_gbis_looks_like_eco4 = gbis_looks_like_eco4.shape[0]

        total_eco4_expectation = (
            ciga_check_expectation +
            without_ciga_expectation +
            passed_ciga_expectation +
            identified_as_gbis_looks_like_eco4
        )
        # This is the work that is at risk
        eco4_work_at_risk = passed_ciga_expectation + ciga_check_expectation

        no_ciga_check_needed_actually_gbis = no_ciga_check_needed_eligible_gbis.shape[0]
        gbis_qualified = gbis_confirmed.shape[0]
        total_gbis_expectation = no_ciga_check_needed_actually_gbis + gbis_qualified

        if proportion_with_survey < 100:
            # We estimate the rest by applying the rates seen on the properties with an EPC
            unsurveyed = remaining[remaining["estimated"] == True]  # noqa: E712
            unsurveyed_labels = unsurveyed["ECO Eligibility"]

            unsurveyed_ciga_needed = int(
                (unsurveyed_labels.str.contains("subject to ciga") == True).sum()  # noqa: E712
            )
            unsurveyed_passed_ciga = int((unsurveyed_labels == "eco4 - passed ciga").sum())
            unsurveyed_eco4 = int((unsurveyed_labels == "eco4").sum())
            unsurveyed_gbis = int((unsurveyed_labels == "gbis").sum())

            # Still needing a CIGA check
            if unsurveyed_ciga_needed and ciga_check_needed.shape[0] == 0:
                # No surveyed sample to derive a rate from, so all of them are counted
                unsurveyed_ciga_needed_expected = unsurveyed_ciga_needed
            else:
                unsurveyed_ciga_needed_expected = _fml_extrapolate(
                    unsurveyed_ciga_needed, ciga_check_expectation, ciga_check_needed.shape[0]
                )

            # Already passed a CIGA check
            unsurveyed_passed_ciga_expected = _fml_extrapolate(
                unsurveyed_passed_ciga, passed_ciga_expectation, ciga_check_passed.shape[0]
            )

            # No CIGA check needed: part of it is expected to be ECO4, part of it GBIS
            unsurveyed_eco4_expected = _fml_extrapolate(
                unsurveyed_eco4, without_ciga_expectation, no_ciga_check_needed.shape[0]
            )
            # The GBIS share must come from the same population as the denominator, i.e. only the
            # ECO4-labelled properties that turned out to look like GBIS (not the overall GBIS total)
            unsurveyed_eco4_as_gbis_expected = _fml_extrapolate(
                unsurveyed_eco4, no_ciga_check_needed_actually_gbis, no_ciga_check_needed.shape[0]
            )

            # Labelled GBIS: part of it qualifies as GBIS, part of it looks like ECO4. Both rates are over the
            # GBIS-labelled surveyed properties, so both are applied to the GBIS-labelled unsurveyed count
            unsurveyed_gbis_qualified_expected = _fml_extrapolate(
                unsurveyed_gbis, gbis_qualified, gbis_identified.shape[0]
            )
            unsurveyed_gbis_as_eco4_expected = _fml_extrapolate(
                unsurveyed_gbis, identified_as_gbis_looks_like_eco4, gbis_identified.shape[0]
            )

            total_eco4_expectation = (
                total_eco4_expectation +
                unsurveyed_ciga_needed_expected +
                unsurveyed_passed_ciga_expected +
                unsurveyed_eco4_expected +
                unsurveyed_gbis_as_eco4_expected
            )
            total_gbis_expectation = (
                total_gbis_expectation +
                unsurveyed_eco4_as_gbis_expected +
                unsurveyed_gbis_qualified_expected
            )

        results.append(
            {
                "HA Name": ha_name,
                "Original ECO4 Estimate - Remaining": original_remaining,
                "Original GGBIS Estimate - Remaining": original_gbis_remaining,
                "EPC verified ECO4 Eligible - Remaining": int(total_eco4_expectation),
                "EPC verified GBIS Eligibile - Remaining": int(total_gbis_expectation),
                # At risk work (only counts properties that have an EPC)
                "Work at risk due to audits": eco4_work_at_risk
            }
        )

    results_df = pd.DataFrame(results)
    results_df.to_csv("analysis - revised - audit update.csv")

    # TODO: Add in estimated GBIS (for eco jobs, of which look like gbis)
    # TODO: Change the left hand side number for our post CIGA estimates
def _remaining_volume_summary(frame, name_col, eco4_col, gbis_col, label, extra_columns=None):
    """
    Pull the remaining ECO4/GBIS volumes for one estimate source out of ``frame`` and add their total.

    :param frame: DataFrame holding the raw columns
    :param name_col: name of the column holding the HA name; renamed to "HA Name"
    :param eco4_col: name of the column holding the remaining ECO4 volume
    :param gbis_col: name of the column holding the remaining GBIS volume
    :param label: suffix used to build the output column names, e.g. "All HA Summary"
    :param extra_columns: optional mapping of {source column: output column} for additional columns to carry
        through; they are placed after the GBIS column and before the total
    :return: a copy with the renamed columns plus "# Total remaining - <label>", sorted by "HA Name"
    """
    extra_columns = extra_columns or {}
    eco4_name = f"# ECO4 remaining - {label}"
    gbis_name = f"# GBIS remaining - {label}"

    summary = frame[[name_col, eco4_col, gbis_col, *extra_columns]].copy().rename(
        columns={name_col: "HA Name", eco4_col: eco4_name, gbis_col: gbis_name, **extra_columns}
    )
    summary[f"# Total remaining - {label}"] = summary[eco4_name] + summary[gbis_name]
    return summary.sort_values("HA Name")


def create_final_report(eco4_unit_revenue=1710, gbis_unit_revenue=600):
    """
    This function will produce the final output for the HA analysis.

    It reads "analysis - revised - audit update.csv" and "pipeline_remaining_raw.csv" from the working directory,
    lines up the remaining ECO4/GBIS volumes from four sources (All HA Summary, postcode list pre CIGA, postcode
    list post CIGA, EPC database) per HA, and writes two CSVs: the volumes and the same table converted to revenue.

    :param eco4_unit_revenue: revenue (in £) applied to each ECO4 unit, including the work-at-risk column
    :param gbis_unit_revenue: revenue (in £) applied to each GBIS unit
    :raises ValueError: if one of the sanity checks on the volumes fails; nothing is written in that case
    :return: None
    """
    epc_validated_results = pd.read_csv("analysis - revised - audit update.csv")
    pipeline_results = pd.read_csv("pipeline_remaining_raw.csv")

    # The pipeline CSV carries stringified multi-level headers as its column names
    ha_name_col = "('', '', '', 'HA Name')"
    gbis_postcode_col = (
        "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')"
    )
    work_at_risk_col = "ECO4 remaining work at risk due to Audits"
    labels = [
        "All HA Summary",
        "Postcode list (pre CIGA)",
        "Postcode list (post CIGA)",
        "From EPC Database (post CIGA)",
    ]

    ####################################
    # Original Warmfront estimates
    ####################################
    all_ha_summary_remaining = _remaining_volume_summary(
        pipeline_results,
        ha_name_col,
        "('ECO4 original', '', 'Remaining - #', '')",
        "('GBIS original', '', 'Remaining - #', '')",
        labels[0],
    )

    ####################################
    # Postcode list - pre-CIGA
    ####################################
    postcode_list_pre_ciga_remaining = _remaining_volume_summary(
        pipeline_results,
        ha_name_col,
        "('ECO4 pre-ciga', '', 'Remaining - #', '')",
        gbis_postcode_col,
        labels[1],
    )

    ####################################
    # Postcode list - post-CIGA
    ####################################
    # The GBIS figure is the same source column as for the pre-CIGA view
    postcode_list_post_ciga_remaining = _remaining_volume_summary(
        pipeline_results,
        ha_name_col,
        "('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')",
        gbis_postcode_col,
        labels[2],
    )

    ####################################
    # From EPC Database
    ####################################
    from_epc_database = _remaining_volume_summary(
        epc_validated_results,
        "HA Name",
        "EPC verified ECO4 Eligible - Remaining",
        "EPC verified GBIS Eligibile - Remaining",
        labels[3],
        extra_columns={"Work at risk due to audits": work_at_risk_col},
    )

    # Combine the datasets. The final inner merge keeps only HAs that have EPC validated results
    volumes = all_ha_summary_remaining.merge(
        postcode_list_pre_ciga_remaining, how="left", on="HA Name"
    ).merge(
        postcode_list_post_ciga_remaining, how="left", on="HA Name"
    ).merge(
        from_epc_database, how="inner", on="HA Name"
    )

    # Convert the volumes to revenue and re-calculate the totals from the converted columns
    revenue = volumes.copy()
    for label in labels:
        eco4_name = f"# ECO4 remaining - {label}"
        gbis_name = f"# GBIS remaining - {label}"
        revenue[eco4_name] = revenue[eco4_name] * eco4_unit_revenue
        revenue[gbis_name] = revenue[gbis_name] * gbis_unit_revenue
        revenue[f"# Total remaining - {label}"] = revenue[eco4_name] + revenue[gbis_name]
    # Work at risk is ECO4 work, so it is valued at the ECO4 rate
    revenue[work_at_risk_col] = revenue[work_at_risk_col] * eco4_unit_revenue

    # Replace the # with £ in the columns
    revenue.columns = [col.replace("#", "£") for col in revenue.columns]

    # We check that each column gets smaller
    eco4_pre = volumes["# ECO4 remaining - Postcode list (pre CIGA)"]
    eco4_post = volumes["# ECO4 remaining - Postcode list (post CIGA)"]
    if not (eco4_pre >= eco4_post).all():
        raise ValueError("decreasing_check1 failed")

    # Just HA32 and HA17 should fail this, and it's due to GBIS jobs looking like ECO4
    eco4_grew = volumes[volumes["# ECO4 remaining - From EPC Database (post CIGA)"] > eco4_post]
    if set(eco4_grew["HA Name"].tolist()) != {"HA17", "HA32"}:
        raise ValueError("decreasing_check2 failed")

    # Check for GBIS
    gbis_pre = volumes["# GBIS remaining - Postcode list (pre CIGA)"]
    gbis_post = volumes["# GBIS remaining - Postcode list (post CIGA)"]
    if not (gbis_pre >= gbis_post).all():
        raise ValueError("decreasing_check3 failed")

    # The equivalent GBIS check against the EPC database is deliberately not performed - it fails for multiple HAs

    # Store final outputs
    volumes.to_csv("HA Analysis - Audit Update - volumes.csv")
    revenue.to_csv("HA Analysis - Audit Update - revenue.csv")
def identify_eco_works(loader):
    """
    Export the remaining ECO4 candidate properties for a fixed set of HAs and summarise the counts.

    For every HA listed in ``names`` that is present in ``loader.data``, assets whose matched survey row has an
    installation status are removed, and the remaining assets are split by "ECO Eligibility" into: needing a CIGA
    check, eco4, and eco4 having passed CIGA. Each non-empty subset is written to a CSV under "local_data/".

    :param loader: object exposing ``data``, a mapping of HA name to a dict holding the "asset_list" and
        "survey_list" DataFrames
    :return: DataFrame with one row per processed HA and the columns "HA Name", "n_needing_ciga", "eco4" and
        "eco4_passed_ciga" (empty if none of the HAs are present)
    """
    # Previous batch, kept for reference:
    # "HA16" For Housing, "HA39" Rooftop, "HA41" Settle, "HA23" Lambeth, "HA14" EMH, "HA7" Believe, "HA102" Thrive

    # HAs to process, mapped to the customer name used in the output file names
    names = {
        "HA50": "Unitas",
        "HA15": "Fairhive",
        "HA107": "ACIS",
        "HA24": "LHP"
    }

    breakdowns = []
    for ha, data_assets in loader.data.items():
        if ha not in names:
            continue

        asset_list = data_assets["asset_list"].copy()
        survey_list = data_assets["survey_list"].copy()

        if not survey_list.empty:
            # Survey rows that were never matched to an asset cannot remove anything; drop them so that null keys
            # are not matched against each other in the merge
            survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
            asset_list = asset_list.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
            asset_list = asset_list.drop(columns=["installation_status"])

        eligibility = asset_list["ECO Eligibility"]
        needs_cga = asset_list[eligibility == "eco4 (subject to ciga)"].copy()
        eco4 = asset_list[eligibility == "eco4"].copy()
        eco4_passed_ciga = asset_list[eligibility == "eco4 - passed ciga"].copy()

        # Store the data - only subsets that contain something get a file
        if not needs_cga.empty:
            needs_cga.to_csv(f"local_data/{names[ha]} - needs ciga.csv")
        if not eco4.empty:
            eco4.to_csv(f"local_data/{names[ha]} - eco4.csv")
        if not eco4_passed_ciga.empty:
            eco4_passed_ciga.to_csv(f"local_data/{names[ha]} - eco4 passed ciga.csv")

        breakdowns.append(
            {
                "HA Name": ha,
                "n_needing_ciga": needs_cga.shape[0],
                "eco4": eco4.shape[0],
                "eco4_passed_ciga": eco4_passed_ciga.shape[0]
            }
        )

    breakdowns = pd.DataFrame(breakdowns)
    breakdowns = breakdowns.fillna(0)
    return breakdowns
def _unitas_worksheet_to_dataframe(worksheet, header_row=1):
    """
    Read an openpyxl worksheet into a DataFrame.

    :param worksheet: the openpyxl worksheet to read
    :param header_row: 1-indexed row holding the column names; data is read from the following row onwards
    :return: DataFrame with one row per worksheet row below the header
    """
    colnames = [cell.value for cell in worksheet[header_row]]
    rows_data = [
        [cell.value for cell in row]
        for row in worksheet.iter_rows(min_row=header_row + 1, values_only=False)
    ]
    return pd.DataFrame(rows_data, columns=colnames)


def unitas_data_prep(loader):
    """
    Adhoc - for UNITAS (HA50), build the list of properties still to survey in phase 2.

    Steps:
      1. Remove from the asset list anything whose matched survey row has an installation status
      2. Remove properties surveyed since (phase 2 sheet, plus phase 1 rows missing from the old survey list)
      3. Add the "subject to ciga" properties that came back with no guarantee in the second round of CIGA checks
      4. Attach the newest EPC found for each property
      5. Attach the previous survey visit outcomes
    The result is written to "Unitas - phase 2 properties to Survey.xlsx".

    The EPC API token is read from the EPC_AUTH_TOKEN environment variable.

    :param loader: loader exposing ``data["HA50"]`` with the "asset_list" and "survey_list" DataFrames
    :raises EnvironmentError: if EPC_AUTH_TOKEN is not set
    :raises Exception: if a completed survey cannot be matched to exactly one asset, an outcome matches more than
        one property, or a property has more than two recorded outcomes
    :return: None
    """
    # Read at call time (rather than using the module level constant) so that values loaded by load_dotenv are
    # picked up. The credential must never be committed to the source.
    epc_api_key = os.getenv("EPC_AUTH_TOKEN")
    if not epc_api_key:
        raise EnvironmentError("EPC_AUTH_TOKEN is not set")

    unitas_data = loader.data["HA50"].copy()
    unitas_asset_list = unitas_data["asset_list"].copy()
    unitas_survey_sheet = unitas_data["survey_list"].copy()

    # We remove the surveyed properties from the asset sheet
    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
    unitas_asset_list = unitas_asset_list.merge(
        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
        how="left",
        on="asset_list_row_id"
    )
    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])

    # We read in the data for the further completed surveys
    unitas_phase_1_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
    )
    phase_1_surveys = _unitas_worksheet_to_dataframe(unitas_phase_1_workbook["ECO 4 - PHASE 1"])
    phase_2_surveys = _unitas_worksheet_to_dataframe(unitas_phase_1_workbook["ECO4 - PHASE 2"])

    # Correct phase 1 surveys in the same fashion as the previous approach
    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())

    # We check all phase 1 surveys are contained in the data we had before; anything that is not uniquely matched
    # on (postcode, number) or (street, number) is treated as an additional completed survey
    additional = []
    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
        matched_1 = unitas_survey_sheet[
            (unitas_survey_sheet["Post Code"] == row["Post Code"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_1.shape[0] == 1:
            continue

        matched_2 = unitas_survey_sheet[
            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_2.shape[0] == 1:
            continue

        additional.append(row.to_dict())
    additional = pd.DataFrame(additional)

    # Drop all of the occurrences of "OFFICE USE ONLY" columns
    phase_2_surveys = phase_2_surveys.drop(columns=[c for c in phase_2_surveys.columns if "OFFICE USE ONLY" in c])

    common_columns = list({c for c in phase_2_surveys.columns if c in additional.columns})
    additional_filtered = additional[common_columns]
    further_unitas_completed_surveys = pd.concat(
        [phase_2_surveys, additional_filtered],
        axis=0,
        ignore_index=True
    )

    # Add a phase 2 key
    further_unitas_completed_surveys["survey_list_row_id"] = [
        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
    ]

    # Completed surveys confirmed as not being in the asset list - these are skipped when matching
    not_in_asset_list = [
        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
    ]
    additional_postcodes = ["st28bg"]

    full_asset_list = unitas_data["asset_list"].copy()
    full_asset_list["matching_postcode"] = full_asset_list["matching_postcode"].str.lower().str.replace(" ", "")

    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
        "ST 5DT", "ST3 5DT"
    )

    # We match these back to the asset list
    matching_lookup = []
    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):
        if row["survey_list_row_id"] in not_in_asset_list:
            continue

        postcode_lower = row["Post Code"].lower().strip().replace(" ", "")
        if postcode_lower in additional_postcodes:
            continue

        # Filter asset list on postcode, then on house number
        df = full_asset_list[
            full_asset_list["matching_postcode"].str.contains(postcode_lower)
        ]
        df = df[df["HouseNo"] == str(row["NO."])]
        if df.shape[0] != 1:
            raise Exception("NOT FOUND")

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    matching_lookup["phase_2_surveyed"] = True

    # We merge this onto the asset list and remove the rows
    unitas_asset_list = unitas_asset_list.merge(
        matching_lookup, how="left", on="asset_list_row_id"
    )
    # Drop rows where phase_2_surveyed is populated
    unitas_asset_list = unitas_asset_list[
        pd.isnull(unitas_asset_list["phase_2_surveyed"])
    ]

    # We add in the new CIGA submissions
    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
    ciga_round_2 = _unitas_worksheet_to_dataframe(unitas_round_2_ciga_workbook["Worksheet"])

    # We merge the ciga sheet to the CIGA dependent part of the asset list
    ciga_dependent_asset_list = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga")
    ].copy()
    ciga_round_2_matched = ciga_dependent_asset_list.merge(
        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
    )
    # Filter on just the properties that had no guarantee
    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]

    # ECO Eligibility
    # not eligible              9227
    # failed ciga               2711
    # eco4 (subject to ciga)    2238
    # eco4 - passed ciga         901
    # gbis                       114
    # eco4                        91

    # We filter on the properties we're looking to re-survey
    unitas_properties_to_survey = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].isin(
            [
                "eco4 - passed ciga",
                "eco4"
            ]
        )
    ].copy()
    unitas_properties_to_survey = pd.concat(
        [
            unitas_properties_to_survey,
            ciga_round_2_matched[unitas_properties_to_survey.columns]
        ]
    )

    # We now retrieve the latest EPC data
    epc_data = []
    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
        property_type, _ = get_property_type_and_built_form(property_meta=unitas_property, ha_name="HA50")
        full_address = unitas_property["matching_address"]

        searcher = SearchEpc(
            address1=str(unitas_property["HouseNo"]),
            postcode=unitas_property["matching_postcode"],
            auth_token=epc_api_key,
            os_api_key="",
            property_type=property_type,
            full_address=full_address,
            fast=True
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None:
            continue

        epc = {
            "asset_list_row_id": unitas_property["asset_list_row_id"],
            **searcher.newest_epc.copy()
        }
        epc_data.append(epc)

    epc_df = pd.DataFrame(epc_data)
    # Pull out just the columns we need
    epc_df = epc_df[
        [
            "asset_list_row_id",
            "address1", "postcode",
            "current-energy-efficiency",
            "current-energy-rating",
            "inspection-date",
            "transaction-type",
            "built-form"
        ]
    ]
    epc_df["EPC Rating"] = (
        epc_df["current-energy-efficiency"].astype(str) +
        epc_df["current-energy-rating"].astype(str)
    )

    # Merge onto the Unitas data:
    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
        epc_df[
            [
                "asset_list_row_id",
                "EPC Rating",
                "inspection-date",
                "transaction-type",
                "built-form"
            ]
        ],
        how="left",
        on="asset_list_row_id"
    )

    # Anything still "subject to ciga" at this point came from the second round of CIGA checks
    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
    )

    for col in ["EPC Rating", "inspection-date", "transaction-type", "built-form"]:
        unitas_properties_to_survey_full[col] = (
            unitas_properties_to_survey_full[col].fillna("No EPC found").astype(str)
        )

    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
        columns={
            "inspection-date": "Last EPC Inspection Date",
            "transaction-type": "Last EPC Reason",
            "built-form": "Last EPC Built Form",
        }
    )

    # We now match to the survey outcomes. The column names of the outcomes sheet are on its second row
    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
    )
    unitas_outcomes = _unitas_worksheet_to_dataframe(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
    unitas_outcomes = unitas_outcomes.rename(
        columns={
            "Notes (If 'no answer' under outcomes, have you checked around the property for access "
            "issues where possible?)": "Notes"
        }
    )

    # Merge outcomes onto properties to survey. The normalised postcode is kept as a local series, built from the
    # frame that is actually being filtered, so that it is always available and does not leak into the output
    survey_postcodes_nospace = (
        unitas_properties_to_survey_full["matching_postcode"].str.lower().str.replace(" ", "")
    )
    outcome_matching = []
    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
        # We search for the corresponding entry in the properties to survey
        postcode_lower = outcome["Postcode"].lower().strip().replace(" ", "")

        df = unitas_properties_to_survey_full[
            survey_postcodes_nospace.str.contains(postcode_lower, na=False)
        ]
        df = df[df["HouseNo"] == str(outcome["No."])]
        if df.empty:
            # The outcome is for a property that is not in the list to survey
            continue
        if df.shape[0] != 1:
            raise Exception("something went wrong")

        outcome_matching.append(
            {
                "asset_list_row_id": df["asset_list_row_id"].values[0],
                **outcome.to_dict()
            }
        )

    outcome_matching = pd.DataFrame(outcome_matching)

    # We can have duplicate matches, so we format the Date letter sent column and retrieve the newest outcome
    outcome_matching["Date letters sent"] = outcome_matching["Date letters sent"].str.lower()
    outcome_matching["Extracted Date"] = outcome_matching["Date letters sent"].str.extract(
        r'(?:w[./]c )(\d{2}\.\d{2}\.\d{4})')
    outcome_matching["Extracted Date"] = pd.to_datetime(outcome_matching["Extracted Date"], format='%d.%m.%Y')
    # We sort by asset_list_row_id and extracted date, so the newest outcome comes first within each property
    outcome_matching = outcome_matching.sort_values(["asset_list_row_id", "Extracted Date"], ascending=[True, False])

    # Some properties will have multiple outcomes - for these, the older visit is appended as "... second visit"
    outcome_matching_grouped = []
    for asset_list_row_id, grouped_data in outcome_matching.groupby("asset_list_row_id"):
        n_visits = grouped_data.shape[0]
        if n_visits > 2:
            raise Exception("something went wrong")

        to_append = {
            "Number of previous visits": n_visits,
            **grouped_data.head(1).to_dict("records")[0]
        }
        if n_visits == 2:
            oldest_visit = grouped_data.tail(1)[['Outcomes', 'Surveyor', 'Notes', 'Date letters sent']].add_suffix(
                " second visit")
            to_append.update(oldest_visit.to_dict("records")[0])
        outcome_matching_grouped.append(to_append)

    outcome_matching_grouped = pd.DataFrame(outcome_matching_grouped)

    unitas_properties_to_survey_with_outcomes = unitas_properties_to_survey_full.merge(
        outcome_matching_grouped, how="left", on="asset_list_row_id"
    )
    unitas_properties_to_survey_with_outcomes["Number of previous visits"] = (
        unitas_properties_to_survey_with_outcomes["Number of previous visits"].fillna(0)
    )

    # Store as an excel
    unitas_properties_to_survey_with_outcomes.to_excel("Unitas - phase 2 properties to Survey.xlsx")
def app():
    """
    Run the housing association analysis for the priority HAs.

    Collects the .xlsx inputs of every priority HA folder under DATA_FOLDER, loads them with DataLoader, and then
    runs the facts and figures, the remaining sales forecast, the EPC based analysis and the final report.
    :return:
    """

    # Determines if we want to use the cached data in s3
    use_cache = True
    # Override to re-build all inputs
    rebuild_inputs = False

    # Grab the December HA figures filepath
    december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"

    # Add in:
    priority_has = {
        "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24",
        "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54",
        "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42",
        # Added as of March 18th
        "HA44", "HA45", "HA51", "HA52", "HA17", "HA5", "HA20",
        # New HAS
        "HAXX", "HAXXX",
    }
    # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
    # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
    # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE]
    #
    # Consider for ECO4:
    # HA 70 - have to merge ECO3 list though,
    # HA17 has LOTs of assets, but the asset list is a mess
    # HA53 but has EPCs done

    # Consider for GBIS:

    # Ignore for now:
    # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in

    # List the .xlsx files of the priority HAs. Each HA has its own folder directly under DATA_FOLDER, so the
    # folder name identifies the HA regardless of where DATA_FOLDER lives or which path separator is in use
    directories = [
        str(file)
        for entry in DATA_FOLDER.iterdir()
        if entry.is_dir() and entry.name in priority_has
        for file in entry.iterdir()
        if file.suffix == '.xlsx'
    ]

    loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs)
    loader.load()
    loader.ha_facts_and_figures()

    # To cache the loader locally while iterating:
    # import pickle
    # with open("ha_analysis_data_temp.pkl", "wb") as f:
    #     pickle.dump(loader, f)
    # with open("ha_analysis_data_temp.pkl", "rb") as f:
    #     loader = pickle.load(f)

    forecast_remaining_sales(loader)

    # Functions to produce the final output
    # fml_data_pull(loader)  # If we need to pull EPC data
    fml_analysis(loader)

    create_final_report()
# Adhoc - for HA16, get the properties that still need a CIGA check
# asset_list_ha16 = loader.data["HA16"]["asset_list"].copy()
# ha_16_need_ciga = asset_list_ha16[
# asset_list_ha16["ECO Eligibility"].str.contains("subject to ciga")
# ]
# completed_cigas = loader.data["HA16"]["ciga_list"].copy()
# # Store the results
# ha_16_need_ciga.to_csv("ha16_need_ciga.csv")
# completed_cigas.to_csv("ha16_completed_cigas.csv")
#
# # Adhoc - look at the current pipeline and identify how many dormant, CIGA dependent properties there are for
# # live projects
#
# # Read excel
# orderbook_filepath = "local_data/ha_data/Warmfront HA client order book overview_20240129.xlsx"
# orderbook_workbook = openpyxl.load_workbook(orderbook_filepath)
# orderbook_sheet = orderbook_workbook["Contractual Info"]
# orderbook_colnames = [cell.value for cell in orderbook_sheet[1]]
#
# rows = []
# for row in orderbook_sheet.iter_rows(min_row=2, values_only=False):
# row_data = [cell.value for cell in row] # This will get you the cell values
# rows.append(row_data)
#
# orderbook = pd.DataFrame(rows, columns=orderbook_colnames)
# live_orderbook = orderbook[orderbook["Live, New, or Historic?"] == "LIVE"].copy()
# live_orderbook['Redacted HA'] = live_orderbook['Redacted HA'].str.replace(" ", "")
#
# dormant_properties = []
# missed_has = []
# for _, customer in live_orderbook.iterrows():
# if customer['Redacted HA'] not in loader.data.keys():
# missed_has.append(customer['Redacted HA'])
# continue
# asset_list = loader.data[customer['Redacted HA']]["asset_list"].copy()
# survey_list = loader.data[customer['Redacted HA']]["survey_list"].copy()
# # Remove sold
# if not survey_list.empty:
# survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
# asset_list = asset_list.merge(
# survey_list[["asset_list_row_id", "installation_status"]],
# how="left",
# on="asset_list_row_id"
# )
# # Anything that has an installation has gone to installation, and therefore is not remaining
# asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
# asset_list = asset_list.drop(columns=["installation_status"])
#
# # We pull out the properties that need a CIGA check
# need_ciga = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"]
# need_archetype = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to archetype)"]
# need_ciga_and_archetype = asset_list[
# asset_list["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)"
# ]
#
# dormant_properties.append(
# {
# "HA Name": customer['Redacted HA'],
# "Need CIGA": need_ciga.shape[0],
# "Need Archetype": need_archetype.shape[0],
# "Need CIGA and Archetype": need_ciga_and_archetype.shape[0]
# }
# )
#
# dormant_properties = pd.DataFrame(dormant_properties)
# totals = dormant_properties.sum()
# totals["HA Name"] = "Total"
#
# dormant_properties = pd.concat([dormant_properties, totals.to_frame().T])
# dormant_properties.to_csv("dormant_properties.csv")
#
# loader.december_figures["ECO4 remaining"].sum()
# december_figures = loader.december_figures.copy()
# december_figures["ECO4 remaining"] = np.where(
# december_figures["ECO4 remaining"] < 0,
# 0,
# december_figures["ECO4 remaining"]
# )
# december_figures["ECO4 remaining"].sum()