mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
7330 lines
316 KiB
Python
7330 lines
316 KiB
Python
import os
|
|
import re
|
|
import openpyxl
|
|
from fuzzywuzzy import fuzz
|
|
from pathlib import Path
|
|
import msgpack
|
|
from datetime import datetime
|
|
import pandas as pd
|
|
import numpy as np
|
|
from utils.s3 import (
|
|
read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3, save_dataframe_to_s3_parquet
|
|
)
|
|
from utils.logger import setup_logger
|
|
from dotenv import load_dotenv
|
|
from tqdm import tqdm
|
|
from backend.SearchEpc import SearchEpc
|
|
from etl.eligibility.Eligibility import Eligibility
|
|
from etl.eligibility.ha_15_32.app import prepare_model_data_row
|
|
from backend.ml_models.api import ModelApi
|
|
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
|
|
from recommendations.recommendation_utils import calculate_cavity_age
|
|
from etl.epc.Record import EPCRecord
|
|
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
|
from etl.epc.DataProcessor import EPCDataProcessor
|
|
from datetime import datetime
|
|
|
|
import inspect
|
|
|
|
# Path of this source file; inspect.getfile on a lambda defined here resolves
# to the file that contains the lambda.
src_file_path = inspect.getfile(lambda: None)

# Locations resolved relative to the directory that holds this file.
ENV_FILE = Path(src_file_path).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
DATA_FOLDER = Path(src_file_path).parent / "local_data" / "ha_data"

logger = setup_logger()
load_dotenv(ENV_FILE)

# Read the token only AFTER the .env file has been loaded. Previously the
# lookup ran before load_dotenv(), so a token defined only in ENV_FILE was
# never seen. A value already present in the process environment is returned
# exactly as before (load_dotenv does not override existing variables by default).
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
|
|
|
# Lookup tables keyed by "HA" identifier. Each table translates the raw labels
# found in that HA's data into normalised categories (House / Flat / Bungalow /
# Maisonette and, where present, a built form). A value of None marks a label
# that has no normalised category.
# Shapes differ between HAs: some are flat {label: category} tables, some are
# split into "property_type" / "built_form" sub-tables, and HA16 / HA39 map a
# label straight to a dict holding both values.
# NOTE(review): HA16 uses hyphenated inner keys ("property-type", "built-form")
# while HA39 uses underscores - confirm against the consumers before unifying.
PROPERTY_TYPE_LOOKUP = {
    "HA1": {
        "built_form": {
            "Mid Terrace": "Mid-Terrace",
            "Semi-Detached": "Semi-Detached",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "Enclosed Mid": "Mid-Terrace",
            "Detached Local Connect": "Detached",
        },
    },
    "HA2": {
        "HOUSE": "House",
        "FLAT": "Flat",
        "SHELTERED": None,
        "BUNGALOW": "Bungalow",
        "BED-SIT": None,
        "MAISONETTE": "Maisonette",
        "HOSTEL": None,
    },
    "HA5": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Bedsit": None,
    },
    "HA6": {
        "property_type": {
            "HOUSE": "House",
            "GROUND FLOOR FLAT": "Flat",
            "UPPER FLOOR FLAT": "Flat",
            "MAISONETTE": "Maisonette",
            "BUNGALOW": "Bungalow",
            "WARDEN BUNGALOW": "Bungalow",
            "WARDEN FLAT": "Flat",
            "EXTRACARE SCHEME": "Flat",
        },
    },
    "HA7": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "End Terraced": "End-Terrace",
        },
    },
    "HA8": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Room": None,
        "Other": None,
        "Commerical": None,
    },
    "HA11": {
        "Flat": "Flat",
        "House": "House",
        "Semi-Det House": "House",
        "Bedsit": None,
        "End-Terr House": "House",
        "Mid-Terr House": "House",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "End Terr Flat": "Flat",
        "Mid Terr Flat": "Flat",
        "Detached Flat": "Flat",
    },
    "HA12": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
    },
    "HA13": {
        "House": "House",
        "Flat": "Flat",
        "House MT": "House",
        "House SD": "House",
        "House ET": "House",
        "Bungalow MT": "Bungalow",
        "Bungalow ET": "Bungalow",
        "ii": None,
    },
    "HA14": {
        "property_type": {
            "House": "House",
            "Flat": "Flat",
            "Bungalow": "Bungalow",
            "Maisonette": "Maisonette",
        },
    },
    "HA15": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Flat over garage": "Flat",
    },
    "HA16": {
        "Semi Detached Bungalow": {"property-type": "Bungalow", "built-form": "Semi-Detached"},
        "Mid Terraced House": {"property-type": "House", "built-form": "Mid-Terrace"},
        "End Terraced House": {"property-type": "House", "built-form": "End-Terrace"},
        "Low Rise Flat": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Semi-Detached House": {"property-type": "House", "built-form": "Semi-Detached"},
        "Detached Bungalow": {"property-type": "Bungalow", "built-form": "Detached"},
        "End Terraced Bungalow": {"property-type": "Bungalow", "built-form": "End-Terrace"},
        "Mid Terraced Bungalow": {"property-type": "Bungalow", "built-form": "Mid-Terrace"},
        "Medium Rise Flat": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Detached House": {"property-type": "House", "built-form": "Detached"},
        "Cottage Flat": {"property-type": "Flat", "built-form": "Semi-Detached"},
        "Maisonette Medium Rise": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Maisonette Over Shop": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "End Terraced Town House": {"property-type": "House", "built-form": "End-Terrace"},
        "Flat Over Shop": {"property-type": "Flat", "built-form": "Mid-Terrace"},
        "Mid Terraced Town House": {"property-type": "House", "built-form": "Mid-Terrace"},
    },
    "HA18": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Bedsit": None,
        "Shop": None,
        "Hostel": None,
        "Block": None,
    },
    "HA20": {
        "House": "House",
        "Flat": "Flat",
        "Sheltered Flat": "Flat",
        "Maisonette": "Maisonette",
        "Bungalow": "Bungalow",
        "House. SD": "House",
        "House. MT": "House",
        "House. ET": "House",
        "Sheltered Bungalow": "Bungalow",
        "Guest Accomodation": None,
        "Sheltered House": "House",
        "House. MT ": "House",  # trailing space is part of the raw label
        "House. D": "House",
    },
    "HA24": {
        "01 HOUSE": "House",
        "02 FLAT": "Flat",
        "03 BUNGALOW": "Bungalow",
        "10 PBUNGALOW": "Bungalow",
        "01 HOUSE MID": "House",
        "13 SBUNGALOW": "Bungalow",
        "12 SBEDSIT": None,  # bedsits have no matching category
        "14 SFLAT": "Flat",
        "05 BEDSIT": None,
        "04 MAISONETTE": "Maisonette",
        "11 PFLAT": "Flat",
        "09 PBEDSIT": None,
    },
    "HA25": {
        "Flat": "Flat",
        "Mid Terrace House": "House",
        "Semi Detached House": "House",
        "End Terrace House": "House",
        "House": "House",
        "Semi Detached Bung": "Bungalow",
        "Bungalow": "Bungalow",
        "End Terrace Bungalow": "Bungalow",
        "Maisonnette": "Maisonette",
        "Mid Terrace Bungalow": "Bungalow",
        "Bedspace": None,
        "Detached House": "House",
        "Bedsit": "Flat",
        "Coach House": "House",
        "Detached Bungalow": "Bungalow",
        "Office Buildings": None,
        "Guest Room": None,
        "Mid Terrace Housekeeping ": "House",  # trailing space is part of the raw label
        "End Terrace Housex": "House",
    },
    "HA28": {
        "Flat": "Flat",
        "Semi detached house": "House",
        "Terraced house": "House",
        "Maisonette flat": "Maisonette",
        "Sheltered bedsit": None,
        "APD flat": "Flat",
        "Bungalow terraced": "Bungalow",
        "Flat with partition": "Flat",
        "Bungalow semi detached": "Bungalow",
        "APD Bungalow": "Bungalow",
        "Sheltered flat": "Flat",
        "Bedsit Flat": "Flat",
        "Bedsit bungalow semi detached": "Bungalow",
        "Sheltered bungalow terraced": "Bungalow",
        "Sheltered bedsit disabled": None,
        "Bedsit bungalow terraced": "Bungalow",
        "Sheltered bungalow semi detached": "Bungalow",
        "Sheltered warden flat": "Flat",
        "Bungalow detached": "Bungalow",
        "Block": None,  # no matching category
        "End Terraced House": "House",
        "Mid Terraced House": "House",
        "#N/A": None,  # treated as an invalid / missing entry
        0: None,  # integer zero is likewise treated as invalid / missing
    },
    "HA30": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "House with Attached Garage": "House",
        "Bed Space": None,  # assumed not to fit any category
        "House with Garage": "House",
        "Bungalow with Wheelchair Access": "Bungalow",
        "Maisonette": "Maisonette",
        "Flat with Wheelchair Access": "Flat",
        "Bedsit": None,  # assumed not to fit any category
        "Flat w Wheelchair Access & Car Park": "Flat",
        "House with Wheelchair Access": "House",
        "Bungalow w Wheelchair Access & Car ": "Bungalow",  # label is truncated in the raw data
    },
    "HA32": {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        "Bungalow Disabled": "Bungalow",  # "Disabled" variants take their base type
        "House": "House",
        "Dormer Bungalow": "Bungalow",
        "Pop-In": None,  # no matching category
        "Flat Disabled": "Flat",
        "Laundry": None,  # no matching category
        "Bedsit": None,  # excluded from the categories
        "Shed": None,  # no matching category
        "Store Room": None,  # no matching category
    },
    "HA34": {
        "Flat": "Flat",
        "House": "House",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "ND": None,
    },
    "HA35": {
        "Flat": "Flat",
        "Maisonette": "Maisonette",
        "House": "House",
        "Bedsit": None,
        "2 Bedroom Unknown": None,
        "1 Bedroom Unknown": None,
        "3 Bedroom Unknown": None,
        "4 Bedroom Unknown": None,
    },
    "HA37": {
        "FLT": "Flat",
        "HSE": "House",
        "BNW": "Bungalow",
        "MAS": "Maisonette",
        "HSL": None,
    },
    "HA39": {
        "Semi house": {"property_type": "House", "built_form": "Semi-Detached"},
        "1st floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"},
        "Ground floor flat": {"property_type": "Flat", "built_form": None},
        "End terrace house": {"property_type": "House", "built_form": "End-Terrace"},
        "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"},
        "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"},
        "2nd floor flat": {"property_type": "Flat", "built_form": None},
        "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"},
        "3rd floor flat": {"property_type": "Flat", "built_form": None},
        "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"},
        "Maisonette": {"property_type": "Maisonette", "built_form": None},
        "Detached house": {"property_type": "House", "built_form": "Detached"},
        "Lower ground floor flat": {"property_type": "Flat", "built_form": None},
        "Dormer bungalow": {"property_type": "Bungalow", "built_form": None},
        "Basement flat": {"property_type": "Flat", "built_form": None},
        "Cluster House": {"property_type": "House", "built_form": "Detached"},
        "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None},
        "Ground floor flat with study": {"property_type": "Flat", "built_form": None},
        "4th floor flat": {"property_type": "Flat", "built_form": None},
        "1st floor flat with study room": {"property_type": "Flat", "built_form": None},
        "2nd floor flat with study": {"property_type": "Flat", "built_form": None},
    },
    "HA41": {
        "Garage": None,
        "House 1919-1945": "House",
        "House 1946-1964": "House",
        "Flats & Maisonettes post 1974": "Flat",
        "Non traditional houses": "House",
        "Sheltered": None,
        "Flats & Maisonettes 1965-1974": "Flat",
        "House post 1974": "House",
        "Block": None,
        "Flats & Maisonettes 1946-1964": "Flat",
        "House 1965-1974": "House",
        "Non traditional flats": "Flat",
        "Bungalow 1965-1974": "Bungalow",
        "PIMSS EMPTY": None,
        "Bungalow post 1974": "Bungalow",
        "Bungalow 1946-1964": "Bungalow",
        "Flats & Maisonettes 1919-1945": "Flat",
        "House pre 1919": "House",
        "Flats & Maisonettes pre 1919": "Flat",
        "Bungalow 1919-1945": "Bungalow",
        "Office": None,
    },
    "HA42": {
        "Flat": "Flat",
        "House": "House",
        "Flat Basement": "Flat",
        "Room": None,
        "Bedsit Flat": "Flat",
        "Maisonette": "Maisonette",
        "Scheme Office": None,
        "Scheme Lounge": None,
        "Bungalow": "Bungalow",
        "Garage": None,
        "Scheme Sleep Room": None,
        "Cluster": None,
        "Scheme Room": None,
    },
    "HA45": {
        "Large block of flats": "Flat",
        "Small block of flats/dwelling converted in to flats": "Flat",
        "Semi-detached house": "House",
        "Mid-terraced house": "House",
        "End-terraced house": "House",
        "Block of flats": "Flat",
        "Detached house": "House",
        "Flat in mixed use building": "Flat",
    },
    "HA48": {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        "Unit": None,
    },
    "HA50": {
        "House": "House",
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        "House SD": "House",
        "House MT": "House",
        "House ET": "House",
        "Bungalow ET": "Bungalow",
        "House SD ": "House",  # trailing-space variants occur in the raw data
        "House. SD": "House",
        "Bungalow SD": "Bungalow",
        "Bungalow MT": "Bungalow",
        "Bungalow D": "Bungalow",
        "House D": "House",
        "House. MT": "House",
        "House ": "House",
        "House ET ": "House",
        " ": None,
        "Flat?": "Flat",
        "Bungalow ": "Bungalow",
    },
    "HA51": {
        "FLAT": "Flat",
        "HOUSE": "House",
        "MAISONETTE": "Maisonette",
        "BEDSIT": None,  # treated as a non-specific residential category
        "BUNGALOW": "Bungalow",
    },
    "HA52": {
        "House - Mid Terrace": "House",
        "Flat - First Floor": "Flat",
        "Flat - Ground Floor": "Flat",
        "House - Semi-Detached": "House",
        "House - End Terrace": "House",
        "Flat - Second Floor": "Flat",
        "Bedsit": None,  # treated as a non-specific residential category
        "Bungalow - Semi-Detached": "Bungalow",
        "Bungalow - Mid Terrace": "Bungalow",
        "Bungalow - End Terrace": "Bungalow",
        "House - Detached": "House",
        "Flat - Third Floor": "Flat",
        "House attached to flats": "House",
        "Flat - Fourth Floor": "Flat",
        "Bungalow - Detached": "Bungalow",
    },
    "HA56": {
        "House Non Specific": "House",
        "HOUSE TERRACED": "House",
        "HOUSE - SEMI DETACHD": "House",
        "Bungalow": "Bungalow",
        "House - End Terraced": "House",
        "Block": None,
        "Block with Communal": None,
        "Bungalow - Terraced": "Bungalow",
        "Bungalow - Semi Dtch": "Bungalow",
        "Block House with rooms": None,
        "Bungalow - End Terr": "Bungalow",
        "House - Mid Terraced": "House",
        "Bungalow - Detached": "Bungalow",
        "House - Detached": "House",
        "HOUSE THREE STOREY": "House",
        "Maisonette": "Maisonette",
        "Communal Block": None,
        "Scheme": None,
    },
    "HA63": {
        "Flat": "Flat",
        "House - Semi detached": "House",
        "House - Detached": "House",
        "House - End Terrace": "House",
        "House - Mid Terrace": "House",
        "Bungalow - Semi detached": "Bungalow",
        "Bungalow": "Bungalow",
        "Bedsit": None,  # treated as a non-specific residential category
        "Maisonette": "Maisonette",
        "Bungalow - End Terrace": "Bungalow",
        "Bungalow - Detached": "Bungalow",
        "Maisonette - Mid Terrace": "Maisonette",
        "Maisonette - End Terrace": "Maisonette",
        "Studio Flat": "Flat",
        "Maisonette - Detached": "Maisonette",
        "Bungalow - Mid Terrace": "Bungalow",
        "Bedsit - Mid Terrace": None,
        "Bedsit - End Terrace": None,
        "Amenity Block - Semi detached": None,  # assumed non-residential
        "Maisonette - Semi Detached": "Maisonette",
        "Amenity Block - Detached": None,  # assumed non-residential
        "Hostel": None,  # not treated as a standard residential property here
        "Bungalow - Attached": "Bungalow",
        "Unknown": None,  # not enough information to categorise
        "Studio Flat - Mid Terrace": "Flat",
        "Chalet - Wheelchair": None,  # specialised type, not categorised here
    },
    "HA107": {
        "property_type": {
            "HOUSE": "House",
            "BUNGALOW": "Bungalow",
            "GRD FLOOR FLAT": "Flat",
            "FIRST FLOOR FLAT": "Flat",
            "SHELTERED BUNGALOW": "Bungalow",
            "MAISONETTE": "Maisonette",
            "SECOND FLOOR FLAT": "Flat",
            "SHELTERED FIRST FLR": "Flat",
            "SHELTERED GROUND FLR": "Flat",
            # NOTE(review): every other bedsit label maps to None or "Flat";
            # confirm that "House" is intended here.
            "GRD FLOOR BED SIT": "House",
        },
        "built_form": {
            "Semi Detached": "Semi-Detached",
            "Mid Terrace": "Mid-Terrace",
            "End Terrace": "End-Terrace",
            "Detached": "Detached",
            "Detatched": "Detached",
        },
    },
    "HA117": {
        "Flat": "Flat",
        "House": "House",
        "Bungalow": "Bungalow",
        "Flat over garage/underpass": "Flat",
    },
    "HAXXX": {
        "mid terraced house": "House",
        "semi detached house": "House",
        "1st fl 4 in a block": "Flat",
        "G/F 4 in a block": "Flat",
        "end terraced house": "House",
        "1st floor flat": "Flat",
        "G/F floor flat": "Flat",
        "semi detached bungalow": "Bungalow",
        "2nd floor flat": "Flat",
        "mid terrace bungalow": "Bungalow",
        "detached bungalow": "Bungalow",
        "end terrace bungalow": "Bungalow",
        "Staff accommodation": None,  # None because of its special nature
    },
}
|
|
|
|
|
|
class DataLoader:
|
|
# For HAs whose asset list already carries a complete address in a single
# column: the names of the columns that hold the address and the postcode.
COLUMN_CONFIG = {
    "HA1": {"address": "Address", "postcode": "Address - Postcode"},
    "HA5": {"address": "Address", "postcode": "matching_postcode"},
    # For HA6 the column named 'address' actually contains the postcode.
    "HA6": {"address": "propertyaddress", "postcode": "address"},
    "HA12": {"address": "Full Address", "postcode": "Postcode"},
    "HA16": {"address": "Address", "postcode": "Postcode"},
    "HA24": {"address": "Address", "postcode": "Postcode"},
    "HA25": {"address": "T1_Address", "postcode": "matching_postcode"},
    "HA30": {"address": "A_Address", "postcode": "A_Postcode"},
    "HA31": {"address": "A_Address", "postcode": "matching_postcode"},
    "HA45": {"address": "Full postal address", "postcode": "Postcode"},
    "HA48": {"address": "Full Address", "postcode": "Postcode"},
    "HA49": {"address": "Property Address Full", "postcode": "Property Postcode"},
    "HA52": {"address": "Postal Address", "postcode": "POSTCODE"},
    "HA54": {"address": "Postal Address", "postcode": "matching_postcode"},
}

# Per-HA integer counts.
# NOTE(review): presumably the number of CIGA records that could not be
# matched to the asset list - confirm against where this is read.
UNMATCHED_CIGA = {
    "HA2": 0,
    "HA6": 117,
    "HA9": 0,
    "HA12": 6,
    "HA13": 119,
    "HA14": 3,
    "HA15": 3,
    "HA16": 7,
    "HA24": 12,
    "HA50": 4,
    "HA63": 15,
    "HA107": 51,
    "HA48": 0,
    "HA45": 0,
    "HA52": 5,
    "HA20": 6,
}

# Per-HA integer counts.
# NOTE(review): presumably the ECO3 counterpart of UNMATCHED_CIGA - confirm.
UNMATCHED_ECO3 = {
    "HA25": 154,
    "HA41": 26,
    "HA50": 5,
    "HA56": 320,
    "HA63": 0,
    "HA117": 4,
    "HA51": 24,
}
|
|
|
|
def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
|
|
self.directories = directories
|
|
self.use_cache = use_cache
|
|
self.december_figures_filepath = december_figures_filepath
|
|
self.rebuild = rebuild
|
|
|
|
self.data = {}
|
|
self.december_figures = None
|
|
self.facts_and_figures = None
|
|
|
|
def create_asset_list_matching_address(self, ha_name, asset_list):
|
|
|
|
if ha_name in [
|
|
"HA1", "HA5", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA52", "HA54"
|
|
]:
|
|
asset_list["matching_address"] = asset_list[
|
|
self.COLUMN_CONFIG[ha_name]["address"]
|
|
].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list[
|
|
self.COLUMN_CONFIG[ha_name]["postcode"]
|
|
].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA2":
|
|
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
|
asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA7":
|
|
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
|
asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA8":
|
|
asset_list["matching_address"] = asset_list["AddressLine1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["AddressLine2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA9":
|
|
asset_list["matching_address"] = asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA11":
|
|
asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA13":
|
|
asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["address 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA14":
|
|
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
|
asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 4"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA15":
|
|
asset_list["matching_address"] = (
|
|
asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA18":
|
|
asset_list["matching_address"] = (
|
|
asset_list["Address"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA19":
|
|
asset_list["matching_address"] = (
|
|
asset_list["Address1"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address2"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address3"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA20":
|
|
asset_list["matching_address"] = (
|
|
asset_list["House Name"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Block"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA21":
|
|
asset_list["matching_address"] = (
|
|
asset_list["Address"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["PostCode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA25":
|
|
asset_list["matching_address"] = asset_list[
|
|
self.COLUMN_CONFIG[ha_name]["address"]
|
|
].astype(str).str.lower().str.strip()
|
|
|
|
asset_list["matching_postcode"] = asset_list['matching_address'].apply(
|
|
lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x
|
|
)
|
|
elif ha_name == "HA27":
|
|
asset_list["matching_address"] = (
|
|
asset_list[" Address"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list[" Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list[" Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA28":
|
|
asset_list["matching_address"] = (
|
|
asset_list["House Number"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Street 1"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA32":
|
|
asset_list["matching_address"] = (
|
|
asset_list["Dwelling num"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Street"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA33":
|
|
asset_list["matching_address"] = (
|
|
asset_list["ADDRESS"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["POST CODE"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["POST CODE"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA34":
|
|
asset_list["matching_address"] = (
|
|
asset_list[" Address"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list[" Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list[" Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA35":
|
|
asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Post Code"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Address Post Code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA37":
|
|
asset_list["matching_address"] = asset_list["ADDRESS LINE 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["ADDRESS LINE 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["ADDRESS LINE 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["POSTCODE"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA38":
|
|
asset_list["matching_address"] = asset_list["House_Number"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address_Line_1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address_Line_2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address_Line_3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA39":
|
|
# Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
|
|
asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["add_2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["add_3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["post_code"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["post_code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA41":
|
|
asset_list["matching_address"] = asset_list["AddressLine1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["AddressLine2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["AddressLine3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["AddressLine4"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["AddressLine5"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA42":
|
|
asset_list["matching_address"] = asset_list["Dwelling Number"].astype(str).str.lower().str.strip() + " " + \
|
|
asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Locality"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA44":
|
|
asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postal Code"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postal Code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA50":
|
|
asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA51":
|
|
asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_address"] = np.where(
|
|
asset_list["Block"].str.strip().str.len() > 0,
|
|
asset_list["Block"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["matching_address"],
|
|
asset_list["matching_address"]
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA56":
|
|
asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA63":
|
|
asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["POSTCODE"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA70":
|
|
asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["POSTCODE"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA107":
|
|
# Create matching_address by concatenating House No, Street, Town, District, Postcode
|
|
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["District"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HA117":
|
|
asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["PostCode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HAXX":
|
|
asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
|
|
asset_list["PostCode"].astype(str).str.lower().str.strip()
|
|
asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
|
|
elif ha_name == "HAXXX":
|
|
asset_list["matching_address"] = (
|
|
asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " +
|
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
)
|
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
|
else:
|
|
raise NotImplementedError("implement me")
|
|
|
|
return asset_list
|
|
|
|
@staticmethod
|
|
def extract_property_info_ha107(properties):
    """
    Map HA107 free-text property descriptions onto a property type and a built form.

    Each description is searched for the known property type and built form keywords. The
    first keyword found, in the order the lookups are declared, decides the mapped value.

    :param properties: iterable of property descriptions, e.g. the unique values of the asset
        list "Property type" column. Entries that are not strings (None, NaN) are kept in the
        output but mapped to None for both fields.
    :return: DataFrame with one row per description and the columns "Property type" (the
        original description), "property_type" and "built_form". The columns are present even
        when properties is empty, so the result can always be merged on "Property type".
    """
    property_types = {
        "House": "House",
        "Flat": "Flat",
        "Bungalow": "Bungalow",
        "Maisonette": "Maisonette",
        # Bedsits are recognised, but mapped to no property type
        "Bedsit": None,
    }

    # Matching is by substring, so "Semi Detached" has to be tested before "Detached":
    # otherwise every "Semi Detached ..." description would be classified as "Detached"
    built_forms = {
        "Semi Detached": "Semi-Detached",
        "Detached": "Detached",
        "End Terrace": "End-Terrace",
        "Mid Terrace": "Mid-Terrace",
    }

    def first_match(description, lookup):
        # Missing descriptions (None / NaN) cannot be searched and map to nothing
        if not isinstance(description, str):
            return None
        for keyword, mapped_value in lookup.items():
            if keyword in description:
                return mapped_value
        return None

    rows = [
        {
            "Property type": description,
            "property_type": first_match(description, property_types),
            "built_form": first_match(description, built_forms),
        }
        for description in properties
    ]

    # Naming the columns keeps the frame mergeable when there are no descriptions at all
    return pd.DataFrame(rows, columns=["Property type", "property_type", "built_form"])
|
|
|
|
def append_asset_list_built_form(self, ha_name, asset_list):
    """
    Add property type / built form information to the asset list where it has to be derived.

    Only HA6 and HA107 need any processing; every other asset list is returned untouched.

    :param ha_name: housing association code, e.g. "HA6"
    :param asset_list: asset list DataFrame
    :return: the asset list, with "built_form" added for HA6, or with the columns produced by
        extract_property_info_ha107 merged on for HA107
    """
    if ha_name == "HA6":
        # The built form is derived row by row from the "Property Type" column
        derived_built_form = asset_list["Property Type"].apply(self.identify_built_form_ha6)
        asset_list["built_form"] = derived_built_form
    elif ha_name == "HA107":
        # Map each distinct description once, then join the mapping back onto the asset list
        distinct_descriptions = asset_list["Property type"].unique()
        mapped_df = self.extract_property_info_ha107(distinct_descriptions)
        asset_list = asset_list.merge(mapped_df, on="Property type", how="left")

    return asset_list
|
|
|
|
@staticmethod
|
|
def create_asset_list_house_no(ha_name, asset_list):
    """
    Append a "HouseNo" column holding the house number to the asset list.

    For housing associations that supply the house number in a dedicated column, that column
    is copied. For all others the house number is taken as the first space-separated token of
    the first comma-separated part of "matching_address".

    :param ha_name: housing association code, e.g. "HA107"
    :param asset_list: asset list DataFrame; needs "matching_address" unless the housing
        association has a dedicated house number column
    :return: the asset list with the "HouseNo" column added
    """
    # Housing associations whose asset list already carries the house number
    house_no_columns = {
        "HA107": "House No",
        "HA32": "Dwelling num",
        "HA28": "House Number",
        "HA38": "House_Number",
        "HA9": "House Number",
        "HAXXX": "Door Number",
    }
    if ha_name in house_no_columns:
        asset_list["HouseNo"] = asset_list[house_no_columns[ha_name]].copy()
        return asset_list

    first_address_part = asset_list["matching_address"].str.split(",", expand=True)[0]
    tokens = first_address_part.str.split(" ", expand=True)

    house_no = tokens[0]
    # If we have "flat" or "valley" as the house number, then the house number is actually
    # the second token. The second token only exists when at least one address contains a
    # space before its first comma, so guard against it being absent.
    if tokens.shape[1] > 1:
        house_no = house_no.where(~house_no.isin(["flat", "valley"]), tokens[1])

    # Remove trailing punctuation such as , or ;
    house_no = house_no.str.rstrip(",;").rename("HouseNo")

    return pd.concat([asset_list, house_no], axis=1)
|
|
|
|
@staticmethod
|
|
def create_ciga_list_house_no(ciga_list):
    """
    Append a "HouseNo" column to the CIGA list.

    The house number is the first space-separated token of the part of "Matched Address" that
    precedes the first comma.

    :param ciga_list: CIGA list DataFrame with a "Matched Address" column
    :return: a new DataFrame made of the CIGA list plus the "HouseNo" column
    """
    leading_part = ciga_list["Matched Address"].str.split(",", expand=True)[0]
    tokens = leading_part.str.split(" ", expand=True)

    # Only the first token matters; how many other tokens there are varies with the data
    house_no_frame = tokens[[0]].rename(columns={0: "HouseNo"})

    return pd.concat([ciga_list, house_no_frame], axis=1)
|
|
|
|
@staticmethod
|
|
def dedupe_ciga_list(ciga_list):
    """
    Remove duplicated addresses from the CIGA list.

    A "unique_key" column is built from "Matched Address" followed by "Matched Postcode" with
    all spaces and punctuation removed, so that records differing only in spacing or
    punctuation count as the same address. The first record of each key is kept.

    :param ciga_list: CIGA list DataFrame with "Matched Address" and "Matched Postcode"
        columns; the "unique_key" column is added to it in place
    :return: the CIGA list restricted to the first occurrence of each unique key
    """
    unique_key = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
    # Remove spaces from the unique key
    unique_key = unique_key.str.replace(" ", "", regex=False)
    # Remove punctuation from the unique key. regex=True has to be explicit: without it
    # recent pandas versions treat the pattern as a literal string and remove nothing.
    unique_key = unique_key.str.replace(r"[^\w\s]", "", regex=True)
    ciga_list["unique_key"] = unique_key

    # Drop duplicated keys
    return ciga_list[~ciga_list["unique_key"].duplicated()]
|
|
|
|
@staticmethod
|
|
def get_asset_sheetname(workbook):
    """
    Work out which sheet of the workbook holds the asset list.

    The known sheet names are tried in a fixed order of preference; "Assets" is the fallback
    when none of them is present.

    :param workbook: object exposing the sheet names through a sheetnames attribute
    :return: name of the asset sheet
    """
    available = workbook.sheetnames

    for candidate in ("Asset List", "Asset list"):
        if candidate in available:
            return candidate

    # "Asset" is only used when there is no "Assets" sheet next to it
    if "Asset" in available and "Assets" not in available:
        return "Asset"

    for candidate in ("Decent Homes Stock", "Report"):
        if candidate in available:
            return candidate

    return "Assets"
|
|
|
|
@staticmethod
|
|
def get_ciga_sheetname(workbook):
    """
    Work out which sheet of the workbook holds the CIGA checks.

    :param workbook: object exposing the sheet names through a sheetnames attribute
    :return: the first known CIGA sheet name found in the workbook, in order of preference,
        or "CIGA" when none of them is present
    """
    known_names = (
        "CIGA Checks",
        "CIGA checks",
        "CIGA check",
        "CIGA Check",
        "CIGA requested",
    )
    available = workbook.sheetnames
    return next((name for name in known_names if name in available), "CIGA")
|
|
|
|
@staticmethod
|
|
def get_survey_sheetname(workbook):
    """
    Work out which sheet of the workbook holds the ECO surveys.

    :param workbook: object exposing the sheet names through a sheetnames attribute
    :return: the first known survey sheet name found in the workbook, in order of preference,
        or "ECO surveys" when none of them is present
    """
    known_names = (
        "ECO Surveys",
        "ECO Survey",
        "ECO 4 Surveys completed",
        "ECO4 Surveys",
    )
    available = workbook.sheetnames
    return next((name for name in known_names if name in available), "ECO surveys")
|
|
|
|
@staticmethod
|
|
def correct_ha51_asset_list(asset_list):
    """
    Apply the HA51-specific house number correction.

    For every address containing "61 wandle bank" the house number is replaced by the
    lower-cased value of the "Block" column.

    :param asset_list: asset list with "matching_address", "Block" and "HouseNo" columns
    :return: the corrected asset list
    """
    at_wandle_bank = asset_list["matching_address"].str.contains("61 wandle bank")
    block_as_house_no = asset_list["Block"].str.lower()

    asset_list["HouseNo"] = np.where(at_wandle_bank, block_as_house_no, asset_list["HouseNo"])

    return asset_list
|
|
|
|
def prepare_ha17(self, workbook):
    """
    Build the HA17 asset list from the two cavity wall sheets of the workbook.

    The "Blocks List - Cavity Wall only" and "Street Properties - Cavity Wall" sheets are
    read, reduced to the matching address, matching postcode, property type and ECO
    eligibility, and stacked. Rows whose address holds a house number range ("1 to 10 ...")
    are expanded into one row per house number, unless the row is marked "Not Eligible".

    :param workbook: workbook object giving access to the two sheets by name
    :return: asset list DataFrame with "asset_list_row_id" and "HouseNo" added
    """
    block_name_column = "Block Name\n[as per Naming Convention procedure]"
    shared_columns = ["matching_address", "matching_postcode", "property_type", "ECO Eligibility"]

    def read_sheet(sheet_name, first_data_row):
        # Both sheets carry their column names on row 2
        sheet = workbook[sheet_name]
        header = [cell.value for cell in sheet[2]]
        body = [
            [cell.value for cell in sheet_row]
            for sheet_row in sheet.iter_rows(min_row=first_data_row, values_only=False)
        ]
        return pd.DataFrame(body, columns=header)

    def normalise(column):
        return column.astype(str).str.lower().str.strip()

    # Blocks: every block is treated as flats. Data is read from row 4 onwards.
    blocks_df = read_sheet("Blocks List - Cavity Wall only", first_data_row=4)
    blocks_df["matching_address"] = (
        normalise(blocks_df[block_name_column]) + ", " +
        normalise(blocks_df["Block Street Name"]) + ", " +
        normalise(blocks_df["Postcode"])
    )
    blocks_df["matching_postcode"] = normalise(blocks_df["Postcode"])
    blocks_df["property_type"] = "Flat"

    # Street properties: the property type comes from the sheet. Data is read from row 3.
    street_properties_df = read_sheet("Street Properties - Cavity Wall", first_data_row=3)
    street_properties_df["matching_address"] = (
        normalise(street_properties_df[block_name_column]) + ", " +
        normalise(street_properties_df["Postcode"])
    )
    street_properties_df["matching_postcode"] = normalise(street_properties_df["Postcode"])
    street_properties_df["property_type"] = street_properties_df[
        "Block typology based on dwelling type\n[defined list]"
    ]

    asset_list_compressed = pd.concat(
        [blocks_df[shared_columns], street_properties_df[shared_columns]],
        axis=0
    )

    # Expand house number ranges such as "1 to 10 some street" into one row per number
    range_pattern = r"(\d+)\s+to\s+(\d+)\s+(.*)"
    expanded_rows = []
    for _, row in tqdm(asset_list_compressed.iterrows(), total=len(asset_list_compressed)):
        range_match = None
        if row["ECO Eligibility"] != "Not Eligible":
            # Detect a house number range; ineligible rows are never expanded
            range_match = re.search(range_pattern, row["matching_address"])

        if not range_match:
            expanded_rows.append(row.to_dict())
            continue

        first_number = int(range_match.group(1))
        last_number = int(range_match.group(2))
        rest_of_address = range_match.group(3)

        # One copy of the row per house number in the range, both ends included
        expanded_rows.extend(
            {**row.to_dict(), "matching_address": f"{house_number} {rest_of_address}"}
            for house_number in range(first_number, last_number + 1)
        )

    asset_list = pd.DataFrame(expanded_rows)

    # Add in asset_list_row_id
    asset_list["asset_list_row_id"] = ["HA17" + str(i) for i in range(0, len(asset_list))]

    # Add on house number
    return self.create_asset_list_house_no(ha_name="HA17", asset_list=asset_list)
|
|
|
|
def load_asset_list(self, filepath, ha_name):
    """
    Load a housing association workbook into its asset, survey, CIGA and ECO3 lists.

    The asset sheet is read and enriched with row ids, matching address / postcode, house
    number, built form and any housing-association-specific correction (a method named
    correct_<ha_name>_asset_list, when one exists). ECO3, survey and CIGA sheets are then
    read and merged against the asset list.

    :param filepath: location of the workbook, handed to openpyxl.load_workbook
    :param ha_name: housing association code, e.g. "HA17"
    :return: tuple (asset_list, survey_list, ciga_list, eco3_list); lists that are not
        loaded for the housing association are returned as empty DataFrames
    """

    def sheet_to_frame(sheet):
        # Headers are on row 1, data from row 2 onwards
        body = [
            [cell.value for cell in sheet_row]
            for sheet_row in sheet.iter_rows(min_row=2, values_only=False)
        ]
        frame = pd.DataFrame(body, columns=[cell.value for cell in sheet[1]])
        # Remove columns that are None
        frame = frame.loc[:, frame.columns.notnull()]
        # Remove rows that are completely empty
        return frame.loc[frame.loc[:, frame.columns].notnull().any(axis=1)]

    workbook = openpyxl.load_workbook(filepath)

    if ha_name == "HA17":
        # HA17 is built by its own routine and has no survey, CIGA or ECO3 lists
        return self.prepare_ha17(workbook), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    asset_sheet = workbook[self.get_asset_sheetname(workbook)]
    asset_sheet_colnames = [cell.value for cell in asset_sheet[1]]

    # For these housing associations the header at the given position is renamed to
    # "matching_postcode"
    postcode_column_position = {"HA25": 11, "HA31": 2, "HA54": 10, "HA5": 2}
    if ha_name in postcode_column_position:
        asset_sheet_colnames[postcode_column_position[ha_name]] = "matching_postcode"

    rows_data = [
        [cell.value for cell in sheet_row]
        for sheet_row in asset_sheet.iter_rows(min_row=2, values_only=False)
    ]
    asset_list = pd.DataFrame(rows_data, columns=asset_sheet_colnames)
    asset_list = asset_list.loc[:, asset_list.columns.notnull()]

    # Remove entirely empty rows - consider all columns apart from row_color
    has_values = asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)
    asset_list = asset_list.loc[has_values]

    # Add in asset_list_row_id
    asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))]

    # Create matching address and matching postcode, then house number and built form
    asset_list = self.create_asset_list_matching_address(ha_name=ha_name, asset_list=asset_list)
    asset_list = self.create_asset_list_house_no(ha_name=ha_name, asset_list=asset_list)
    asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list)

    # Correct the asset list if a correction exists for this housing association
    correction_function_name = f"correct_{ha_name.lower()}_asset_list"
    if hasattr(self, correction_function_name):
        asset_list = getattr(self, correction_function_name)(asset_list)

    # For HA1 and HA27 there are no survey or CIGA lists, so the asset list can be
    # returned now
    if ha_name in ["HA1", "HA27"]:
        return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # If we have ECO3 surveys, we need to match them, because any properties treated under
    # ECO3 won't be suitable under ECO4, since their walls will be filled.
    # The first sheet whose name contains "eco3" (ignoring case and spaces) is used.
    eco3_list = pd.DataFrame()
    eco3_sheetname = next(
        (name for name in workbook.sheetnames if "eco3" in name.lower().replace(" ", "")),
        None
    )
    if eco3_sheetname is not None:
        eco3_list = sheet_to_frame(workbook[eco3_sheetname])
        eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))]

    # Perform the eco3 merge
    if not eco3_list.empty:
        eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name)

    if ha_name in ["HA25"]:
        # Accommodate the HA25 structure: only the asset and ECO3 lists are returned
        return asset_list, pd.DataFrame(), pd.DataFrame(), eco3_list

    # Survey list
    survey_list = sheet_to_frame(workbook[self.get_survey_sheetname(workbook)])
    survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))]

    # Perform survey list merge
    if not survey_list.empty:
        survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)

    # CIGA checks
    ciga_list = sheet_to_frame(workbook[self.get_ciga_sheetname(workbook)])

    # Perform ciga list merge
    if not ciga_list.empty:
        # Remove rows with missing postcode which happens in a small number of cases
        ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
        ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
        ciga_list = self.create_ciga_list_house_no(ciga_list)
        ciga_list = self.dedupe_ciga_list(ciga_list)
        ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)

    return asset_list, survey_list, ciga_list, eco3_list
|
|
|
|
@staticmethod
|
|
def correct_ha6_asset_list(asset_list):
    """
    Correct known street name spellings in the HA6 asset list.

    Each correction is applied to "propertyaddress" in its capitalised form and to
    "matching_address" in its lower-case form.

    :param asset_list: asset list with "propertyaddress" and "matching_address" columns
    :return: the corrected asset list
    """
    # ((capitalised wrong, capitalised right), (lower-case wrong, lower-case right))
    street_corrections = (
        (("Baggott Place", "Baggotts Place"), ("baggott place", "baggotts place")),
        (("Cherry Tree", "Cherrytree"), ("cherry tree", "cherrytree")),
        (("Maryhill Close", "Mary Hill Close"), ("maryhill close", "mary hill close")),
        (("Moffat Way", "Moffatt Way"), ("moffat way", "moffatt way")),
    )

    for (wrong, right), (wrong_lower, right_lower) in street_corrections:
        asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace(wrong, right)
        asset_list["matching_address"] = asset_list["matching_address"].str.replace(
            wrong_lower, right_lower
        )

    return asset_list
|
|
|
|
@staticmethod
|
|
def correct_ha56_asset_list(asset_list):
    """
    Mark known HA56 postcodes and blocks as not eligible.

    :param asset_list: asset list with "Post Code", "Address 1" and "ECO Eligibility" columns
    :return: the asset list with "ECO Eligibility" set to "Not eligible" for the listed rows
    """
    excluded_postcodes = [
        # CH1 4JR has already been surveyed, but it's listed in the asset list as a single
        # row, when it's actually 32 units, so we just set this as ineligible
        "CH1 4JR",
        # Same for CW8 3EU
        "CW8 3EU",
        "CW1 3HP",
        "WA4 2PH",
        "BD6 1QJ",
        "L39 1RS",
        "WA10 2DE",
        # Already surveyed under ECO4
        "SK17 6NR",
    ]

    postcode = asset_list["Post Code"]
    # Within WA5 0EN only one block is excluded
    excluded_block = (
        (postcode == "WA5 0EN") &
        (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")
    )
    not_eligible = postcode.isin(excluded_postcodes) | excluded_block

    asset_list["ECO Eligibility"] = np.where(
        not_eligible,
        "Not eligible",
        asset_list["ECO Eligibility"]
    )

    return asset_list
|
|
|
|
@staticmethod
|
|
def correct_ha14_asset_list(asset_list):
    """
    Correct the postcode of 5 Queens Court in the HA14 asset list.

    The property is listed under DE72 3NP, but its postcode is actually DE72 3QZ. Both the
    matching postcode and the matching address are overwritten for that row.

    :param asset_list: asset list with "Address 1", "Postcode", "matching_postcode" and
        "matching_address" columns
    :return: the corrected asset list
    """
    is_queens_court = (
        (asset_list["Address 1"] == "5 Queens Court") &
        (asset_list["Postcode"].str.strip() == "DE72 3NP")
    )

    # NOTE(review): the postcode is written in upper case here while the matching address
    # below is lower case - confirm which case the matching expects
    asset_list.loc[is_queens_court, "matching_postcode"] = "DE72 3QZ"

    # We then correct the matching_address
    asset_list.loc[is_queens_court, "matching_address"] = (
        "5 queens court, garfield avenue, draycott, derby, de72 3qz"
    )

    return asset_list
|
|
|
|
@staticmethod
|
|
def correct_ha15_asset_list(asset_list):
    """
    Set the matching postcode of 103 Priory Crescent to "hp19 9ny" in the HA15 asset list.

    :param asset_list: asset list with "Address Line 1" and "matching_postcode" columns
    :return: the corrected asset list
    """
    is_priory_crescent = asset_list["Address Line 1"] == "103 Priory Crescent"
    current_postcode = asset_list["matching_postcode"]

    asset_list["matching_postcode"] = np.where(is_priory_crescent, "hp19 9ny", current_postcode)
    return asset_list
|
|
|
|
@staticmethod
|
|
def correct_ha32_asset_list(asset_list):
    """
    Correct the postcode of 7 Norton Grove in the HA32 asset list.

    The row with street "Norton Grove", postcode "HU4 6HQ" and dwelling number "7" gets the
    postcode "hu4 6hg".

    :param asset_list: asset list with "Street", "Postcode" and "Dwelling num" columns
    :return: the corrected asset list
    """
    # NOTE(review): only "Postcode" is changed, not the matching postcode / address columns -
    # confirm that this is what the matching needs
    is_seven_norton_grove = (
        (asset_list["Street"] == "Norton Grove")
        & (asset_list["Postcode"] == "HU4 6HQ")
        & (asset_list["Dwelling num"] == "7")
    )

    asset_list["Postcode"] = np.where(is_seven_norton_grove, "hu4 6hg", asset_list["Postcode"])
    return asset_list
|
|
|
|
@staticmethod
|
|
def correct_ha38_asset_list(asset_list):
    """
    Apply the HA38-specific corrections to house numbers and matching addresses.

    Some addresses are written as "<building>/flat <n> ...", i.e. the flat number comes after
    the building name (for Kingsford court, the house number is at the end of the address).
    For those rows the flat number becomes the HouseNo and the matching address is rearranged
    so that the flat comes first. A handful of "10 SOUTH VIEW" rooms and flats are then
    overridden by hand.

    The helper column "ExtractedHouseNo" is added to the asset list as a side effect.

    :param asset_list: asset list with "matching_address", "HouseNo" and "Address_Line_1"
        columns
    :return: the corrected asset list
    """

    def rearrange_address_if_flat(address):
        # "<building>/flat <rest>" becomes "FLAT <rest>, <building>"
        if '/flat' in address.lower():
            parts = address.split('/flat', 1)
            return f"FLAT{parts[1]}, {parts[0]}"
        return address

    def extract_house_no_if_flat(address):
        if '/flat' not in address.lower():
            return None
        # Attempt to extract the house number following "/flat"
        try:
            house_no = address.split('/flat ')[1].split(' ')[0]
        except IndexError:
            return None
        # Remove trailing comma
        return house_no.replace(",", "")

    # The house number has to be extracted before the address is rearranged
    asset_list['ExtractedHouseNo'] = asset_list['matching_address'].apply(extract_house_no_if_flat)
    asset_list.loc[asset_list['ExtractedHouseNo'].notnull(), 'HouseNo'] = asset_list['ExtractedHouseNo']
    asset_list['matching_address'] = asset_list['matching_address'].apply(rearrange_address_if_flat)

    # We update a few specific rows. Each entry is:
    # (address lines that get the house number, house number,
    #  address line that gets the matching address, matching address)
    south_view_overrides = [
        (
            ["10 SOUTH VIEW/ROOM A1", "10 SOUTH VIEW/ROOM A2", "10 SOUTH VIEW/ROOM A3"],
            "10A",
            "10 SOUTH VIEW/ROOM A1",
            "10a, 10 south view/room a1, spennymoor, co. durham, dl16 7df",
        ),
        (
            [
                "10 SOUTH VIEW/ROOM B1",
                "10 SOUTH VIEW/ROOM B2",
                "10 SOUTH VIEW/ROOM B3",
                "10 SOUTH VIEW/ROOM B4",
            ],
            "10B",
            "10 SOUTH VIEW/ROOM B1",
            "10b, 10 south view/room b1, spennymoor, co. durham, dl16 7df",
        ),
        (
            ["10 SOUTH VIEW/FLAT C"],
            "10C",
            "10 SOUTH VIEW/FLAT C",
            "FLAT c, spennymoor, co. durham, dl16 7df, 10c, 10 south view",
        ),
        (
            ["10 SOUTH VIEW/FLAT D"],
            "10D",
            "10 SOUTH VIEW/FLAT D",
            "FLAT d, spennymoor, co. durham, dl16 7df, 10d, 10 south view",
        ),
        (
            ["10 SOUTH VIEW/FLAT E"],
            "10E",
            "10 SOUTH VIEW/FLAT E",
            "FLAT e, spennymoor, co. durham, dl16 7df, 10e, 10 south view",
        ),
    ]

    for house_no_lines, house_no, address_line, matching_address in south_view_overrides:
        asset_list["HouseNo"] = np.where(
            asset_list["Address_Line_1"].isin(house_no_lines),
            house_no,
            asset_list["HouseNo"]
        )
        asset_list["matching_address"] = np.where(
            asset_list["Address_Line_1"].isin([address_line]),
            matching_address,
            asset_list["matching_address"]
        )

    return asset_list
|
|
|
|
@staticmethod
|
|
def correct_ha6_survey_list(survey_list):
    """
    Apply the manual corrections for the HA6 survey list.

    Street names are corrected with literal substring replacements, a number of individual
    postcodes are fixed, and known bad or duplicated survey records are dropped. The steps
    are order-dependent and run in the sequence below.

    :param survey_list: survey list with the columns "Street / Block Name", "Post Code",
        "NO.", "INSTALLED OR CANCELLED" and "SUBMISSION DATE"
    :return: the corrected survey list, without the dropped records
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"

    # First batch of street name corrections; "/KNUTTON" is stripped from the street name
    for wrong, right in (
        ("Seabridge Road", "Seabridge Lane"),
        ("/KNUTTON", ""),
        ("Clevend Road", "Cleveland Road"),
        ("TURNERS AVENUE", "Turner Avenue"),
        ("WEDGEWWOD AVENUE", "Wedgwood Avenue"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

    # The cherrytree record has wrong postcode
    survey_list.loc[survey_list[street_col] == "Cherrytree road", postcode_col] = "ST5 7BP"

    # Second batch: includes the general " RD" / " Rd" -> " Road" expansion, the removal of
    # full stops and the stripping of "/TALKE" from the street name
    for wrong, right in (
        ("MONUMENT RD", "Monument Road"),
        (" RD", " Road"),
        ("HILARY Road", "Hillary Road"),
        (".", ""),
        ("Chatworth road", "Chatsworth Place"),
        ("Wood Croft", "Woodcroft"),
        ("Milstone Avenue", "Millstone Avenue"),
        ("/TALKE", ""),
        ("Woodcutts Street", "Woodshutts Street"),
        ("HILLARY AVENUE", "Hillary Road"),
        (" Rd", " Road"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

    # We have a record listed as 19, MAPLE AVENUE ST7 1JX, when it should be
    # 19, Hollins Crescent ST7 1JX
    survey_list.loc[
        (survey_list[street_col] == "MAPLE AVENUE") &
        (survey_list["NO."].isin([19])) &
        (survey_list[postcode_col] == "ST7 1JX"),
        street_col
    ] = "Hollins Crescent"

    # However, some of the maple avenue records are indeed Maple avenue, but are listed with
    # the wrong postcode. E.g. number 26
    survey_list.loc[
        (survey_list[street_col] == "MAPLE AVENUE") &
        (survey_list["NO."].isin([26])) &
        (survey_list[postcode_col] == "ST7 1JX"),
        postcode_col
    ] = "ST7 1JW"

    # Third batch of street name corrections
    for wrong, right in (
        ("BURSLEY Road", "Bursley Way"),
        ("Brittania Avenue", "Brittain Avenue"),
        ("Hawthorn Road", "Hawthorne Road"),
        ("Eastdale Place", "Easdale Place"),
        ("Wedgewood Road", "Wedgwood Road"),
        ("Droitwich Drive", "Droitwich Close"),
        ("Longdale Road", "Langdale Road"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

    # We have 2 addresses in the survey list that don't have postcodes. We'll manually add
    # them in
    for street_name, missing_postcode in (("Rogers Avenue", "ST5 9AT"), ("Cedar Road", "ST5 7BY")):
        survey_list.loc[
            (survey_list[street_col] == street_name) &
            pd.isnull(survey_list[postcode_col]),
            postcode_col
        ] = missing_postcode

    # PERFORM ADDITIONAL DROPS
    streets = survey_list[street_col]
    postcodes = survey_list[postcode_col]
    numbers = survey_list['NO.']
    no_update_yet = survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")

    def located_at(street_name, postcode):
        return (streets == street_name) & (postcodes == postcode)

    conditions_to_drop = [
        located_at("Bedford Crescent", "ST5 3EH") & (numbers == 23) & no_update_yet,
        located_at("Hereford Avenue", "ST5 3EJ") & (numbers == 92) & no_update_yet,
        located_at("Seabridge Lane", "ST5 3EX") & (numbers.isin([16, 18, 42])) & no_update_yet,
        located_at("ESKDALE PLACE", "ST5 3QW") & (numbers == 5) & (
            survey_list['SUBMISSION DATE'].astype(str) == "2023-03-06 00:00:00"),
        located_at("Birch House road", "ST6 2LS") & (numbers.isin([56, 58])),
        located_at("Blackthorn Place", "ST6 2LS") & (numbers.isin([37, 39])),
        located_at("Whitethorn Way", "ST5 7BT") & (numbers.isin([17, 6])),
        located_at("Lion Grove", "ST5 7HQ") & (numbers.isin([10, 12])) & no_update_yet,
        located_at("DENRY CRESCENT", "ST5 8JW") & (numbers == 87) & no_update_yet,
        located_at("HOLLINS CRESCENT", "ST7 1JW") & (numbers == 19),
    ]

    # A record is dropped as soon as it meets any one of the conditions
    combined_condition = np.logical_or.reduce(conditions_to_drop)
    survey_list = survey_list[~combined_condition]

    # Whitethorn Way records listed under the wrong postcode
    for wrong_postcode, number in (("ST5 3EH", 17), ("ST5 3ED", 6)):
        survey_list[postcode_col] = np.where(
            (survey_list[street_col] == "Whitethorn Way") & (survey_list[postcode_col] == wrong_postcode) & (
                survey_list['NO.'] == number),
            "ST5 7BT",
            survey_list[postcode_col]
        )

    # Maple avenue (stoke on trent, not newcastle) should be st7 1jw, and
    # Hollins Crescent should be st7 1jx
    for street_fragment, wrong_postcode, right_postcode in (
        ("maple avenue", "st7 1jx", "st7 1jw"),
        ("hollins crescent", "st7 1jw", "st7 1jx"),
    ):
        survey_list[postcode_col] = np.where(
            (survey_list[street_col].str.lower().str.contains(street_fragment)) & (
                survey_list[postcode_col].str.lower() == wrong_postcode
            ),
            right_postcode,
            survey_list[postcode_col]
        )

    # Additional drops as the above misses some
    missed_seabridge_lane = (
        (survey_list["NO."].astype(str).isin(["18", "42"])) &
        (survey_list[street_col] == "Seabridge Lane") &
        (survey_list[postcode_col] == "ST5 3EY") &
        (survey_list["SUBMISSION DATE"].astype(str) == "24.07.2023") &
        (survey_list["INSTALLED OR CANCELLED"].str.contains("NO UPDATE YET"))
    )
    survey_list = survey_list[~missed_seabridge_lane]

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha14_survey_list(survey_list):
    """
    Correct known street name spellings in the HA14 survey list.

    :param survey_list: survey list with "Street / Block Name" and "Post Code" columns
    :return: the corrected survey list
    """
    street_col = "Street / Block Name"

    # Corrections applied wherever the misspelt name occurs within the street name
    for wrong, right in (
        ("Godfrey Road", "Godfrey Drive"),
        ("Oiliver Road", "Oliver Road"),
        ("MACDONALD SQAURE", "MACDONALD SQUARE"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

    # For postcodes DE7 4FB and DE7 4EZ the street is spelled WINDERMERE AVENUE, not
    # WINDEREMERE AVENUE (with the extra e)
    misspelt_windermere = (
        (survey_list[street_col] == "WINDEREMERE AVENUE") &
        (survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"]))
    )
    survey_list.loc[misspelt_windermere, street_col] = "WINDERMERE AVENUE"

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha15_survey_list(survey_list):
    """Remove the trailing town from one HA15 street name.

    "Mary Mac Manus Drive, Milton Keynes" is shortened to
    "Mary Mac Manus Drive" wherever it occurs in "Street / Block Name".
    The DataFrame is modified in place and returned.
    """
    street_col = "Street / Block Name"
    with_town = "Mary Mac Manus Drive, Milton Keynes"
    without_town = "Mary Mac Manus Drive"

    cleaned = survey_list[street_col].str.replace(with_town, without_town)
    survey_list[street_col] = cleaned
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha16_survey_list(survey_list):
    """Normalise and correct the HA16 survey list.

    Steps, applied in this order:

    1. Street names: "/" becomes ", ", text is lower-cased, the standalone
       word "rd" is expanded to "road", whitespace around commas is
       normalised to ", " and the ends are stripped.
    2. Two known-wrong postcodes are corrected.
    3. An ordered table of street-name corrections is applied as literal
       substring replacements (order matters: later entries see the output
       of earlier ones).
    4. House number "66-68" is collapsed to "66".
    5. Two duplicated survey rows are dropped.

    Args:
        survey_list: DataFrame with the columns "Street / Block Name",
            "Post Code", "NO." and "SUBMISSION DATE".  Its columns are
            modified in place before rows are filtered.

    Returns:
        The corrected DataFrame with the duplicate rows removed.
    """
    street = "Street / Block Name"
    postcode = "Post Code"
    number = "NO."

    # --- 1. Street-name normalisation ---------------------------------
    names = survey_list[street].str.replace("/", ", ", regex=False).str.lower()
    # Names are lower-case from here on, so an upper-case special case such
    # as "REEDS RD" can never match; "reeds rd" is covered by the generic
    # word-boundary expansion below.
    names = names.str.replace(r"\brd\b", "road", regex=True)
    # Collapses " , ", " ," and "," (with any surrounding whitespace) to ", ".
    names = names.str.replace(r"\s*,\s*", ", ", regex=True)
    survey_list[street] = names.str.strip()

    # --- 2. Postcode corrections --------------------------------------
    survey_list[postcode] = np.where(
        survey_list[postcode] == "M38 0SA", "M38 9SA", survey_list[postcode]
    )
    nelson_typo = (survey_list[street] == "nelson drive") & (survey_list[postcode] == "M44 5JE")
    survey_list[postcode] = np.where(nelson_typo, "M44 5JF", survey_list[postcode])

    # --- 3. Street-name corrections (literal, ordered) ----------------
    # NOTE(review): a few replacements re-introduce capitals ("Vaughan",
    # "cartleach Grove", "arbor Grove") into an otherwise lower-cased
    # column.  Kept as-is; confirm downstream matching is case-insensitive.
    street_corrections = (
        ("eccels", "eccles"),
        ("chatley, road", "chatley road"),
        ("vaughen", "Vaughan"),
        ("cresent", "crescent"),
        ("plantation road", "plantation avenue"),
        ("how clough drive", "howclough drive"),
        ("brockhurst lane", "brookhurst lane"),
        ("biirch road", "birch road"),
        ("hadson road", "hodson road"),
        ("harbonne avennue", "narbonne avenue"),
        ("cumberland road, cadishead", "cumberland avenue, cadishead"),
        ("aston field drive", "ashton field drive"),
        ("wedgewood road", "wedgwood road"),
        ("hamilton close", "hamilton avenue"),
        ("lichens crescent, fitton hill", "lichens crescent"),
        ("south croft, fitton hill", "south croft"),
        (", fitton hill", ""),
        ("firtree dr", "fir tree avenue"),
        ("hawthorne road", "hawthorn crescent"),
        ("rein lee avenue", "reins lee avenue"),
        ("westerhill road", "wester hill road"),
        ("st martins road", "saint martins road"),
        ("timperley avenue", "timperley close"),
        ("eastwood road", "eastwood avenue"),
        ("new road", "new street"),
        ("grassmere road", "grasmere road"),
        ("hulton road", "hulton avenue"),
        ("beechfield avenue", "beechfield road"),
        ("princess avenue", "princes avenue"),
        ("edge ford crecent", "edge fold crescent"),
        ("conniston avenue", "coniston avenue"),
        ("blackthorne crescent", "blackthorn crescent"),
        ("wellstock road", "wellstock lane"),
        ("brackley avenue", "brackley street"),
        ("brook avenue swinton", "brook avenue, swinton"),
        ("green avenue swinton", "green avenue, swinton"),
        ("grasmere avenue wardley", "grasmere avenue, wardley"),
        ("mardale avenue wardle", "mardale avenue, wardle"),
        ("carleach grove", "cartleach Grove"),
        ("arbour grove", "arbor Grove"),
    )
    names = survey_list[street]
    for wrong, right in street_corrections:
        names = names.str.replace(wrong, right, regex=False)
    survey_list[street] = names

    # --- 4. House-number range (recorded for clively avenue 66-68) ----
    # np.where is kept deliberately so the column's resulting dtype is the
    # same as before this refactor.
    survey_list[number] = np.where(
        survey_list[number] == "66-68", "66", survey_list[number]
    )

    # --- 5. Duplicated entries ----------------------------------------
    # Each tuple is (street, house number, postcode, submission date); the
    # number and date are compared as strings.
    duplicated_entries = (
        ("york road", "12", "M44 5HU", "45229"),
        ("peatfield avenue", "23", "M27 9XG", "45236"),
    )
    for dup_street, dup_number, dup_postcode, dup_date in duplicated_entries:
        is_duplicate = (
            (survey_list[street] == dup_street)
            & (survey_list[number].astype(str) == dup_number)
            & (survey_list[postcode] == dup_postcode)
            & (survey_list["SUBMISSION DATE"].astype(str) == dup_date)
        )
        survey_list = survey_list[~is_duplicate]

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha24_survey_list(survey_list):
    """Normalise and correct street names in the HA24 survey list.

    Street names have "/" turned into ", ", are lower-cased and stripped,
    and then an ordered table of literal substring corrections is applied.
    All corrections are explicit literals (``regex=False``) so that the "."
    in "st. leodegars close" is never treated as a regex wildcard, whatever
    the pandas default for ``regex`` is.

    Args:
        survey_list: DataFrame with a "Street / Block Name" column.  It is
            modified in place.

    Returns:
        The same DataFrame, corrected.
    """
    street = "Street / Block Name"

    names = survey_list[street].str.replace("/", ", ", regex=False)
    names = names.str.lower().str.strip()

    # Order matters: e.g. "st ives road" is first renamed to
    # "st. ives crescent", and the generic "council house, " prefix removal
    # runs after the two specific council-house entries.
    street_corrections = (
        ("council house, nidds lane", "nidds lane"),
        ("wirral avenue", "wirrall avenue"),
        ("st ives road", "st. ives crescent"),
        ("sundringham road", "sandringham road"),
        ("milton avenue", "milton road"),
        ("st ives crescent", "st. ives crescent"),
        ("council house, waterbelly lane", "waterbelly lane"),
        # Generally remove "council house, " wherever it still appears.
        ("council house, ", ""),
        ("st. leodegars close", "st leodegars close"),
        ("montgomery crescent", "montgomery road"),
    )
    for wrong, right in street_corrections:
        names = names.str.replace(wrong, right, regex=False)

    survey_list[street] = names
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha28_survey_list(survey_list):
    """Align and correct the HA28 survey list.

    * Renames the house-number column "NO " (trailing space) to "NO." so it
      matches the other survey sheets.
    * Inserts the missing space into postcodes "ME75HA" and "ME75TW"
      (exact-value matches only).
    * Reduces two "BLOCK/STREET" names to the block name alone.

    Args:
        survey_list: DataFrame with "Street / Block Name" and "Post Code"
            columns (and normally "NO ").

    Returns:
        A new, corrected DataFrame (``rename`` returns a copy).
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"

    survey_list = survey_list.rename(columns={"NO ": "NO."})

    for unspaced, spaced in (("ME75HA", "ME7 5HA"), ("ME75TW", "ME7 5TW")):
        current = survey_list[postcode_col]
        survey_list[postcode_col] = np.where(current == unspaced, spaced, current)

    for block_and_street, block_only in (
        ("ANDREW MANOR/BRITTON ST", "ANDREW MANOR"),
        ("ST MARKS HOUSE/SAXON ST", "ST MARKS HOUSE"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(
            block_and_street, block_only
        )

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha38_survey_list(survey_list):
    """Align and correct the HA38 survey list.

    The steps are order-dependent and are applied exactly in the sequence
    below: street names are progressively simplified (specific fixes, then
    "keep the part before '/'", then "keep the part before ','"), some
    surveyed addresses that have no counterpart in the asset list are
    dropped, and two address-specific postcode / street fixes are applied.

    The BEECH ROAD postcode correction is applied once; applying it a second
    time (as an earlier revision did) cannot change anything.

    Args:
        survey_list: DataFrame with "Street / Block Name" and "Post Code"
            columns and a house-number column named "NO " or "NO.".

    Returns:
        A new, corrected DataFrame with the unwanted rows removed.
    """
    street = "Street / Block Name"
    postcode = "Post Code"
    number = "NO."

    def rename_streets(frame, corrections):
        """Apply ordered (wrong, right) literal substring fixes to the street column."""
        names = frame[street]
        for wrong, right in corrections:
            names = names.str.replace(wrong, right, regex=False)
        frame[street] = names
        return frame

    def drop_house(frame, street_name, house_number):
        """Drop rows whose street equals street_name and whose number equals house_number."""
        unwanted = (frame[street] == street_name) & (frame[number] == house_number)
        return frame[~unwanted]

    # The sheet labels the column "NO " (trailing space); the other survey
    # sheets use "NO.".
    survey_list = survey_list.rename(columns={"NO ": "NO."})

    survey_list = rename_streets(survey_list, (
        ("Kingsford Court, Coombe Valley Road", "Kingsford Court"),
        ("LESLIE TEW COURT/DERWENT ROAD", "LESLIE TEW COURT"),
    ))

    # There is no 18A LESLIE TEW COURT in the asset list.
    no_such_flat = (
        (survey_list[street] == "LESLIE TEW COURT")
        & (survey_list[postcode] == "TN10 3TX")
        & (survey_list[number] == "18A")
    )
    survey_list = survey_list[~no_such_flat]

    survey_list = rename_streets(survey_list, (
        ("Brindley House, Wellbeck Road", "Brindley House"),
    ))

    # Keep only the part of the name before the first "/".
    survey_list[street] = survey_list[street].str.split("/").str[0].str.strip()
    survey_list = rename_streets(survey_list, (("HUNTSMAN WAY", "HUNTSMANS WAY"),))

    # Keep only the part of the name before the first ",".
    survey_list[street] = survey_list[street].str.split(",").str[0].str.strip()
    survey_list = rename_streets(survey_list, (
        ("McCLAREN COURT", "MCLAREN COURT"),
        ("ST JAMES CLOISTERS", "ST. JAMES'S CLOISTERS"),
    ))

    # Flats recorded as "FLAT n 22" / "FLAT n 24" on MELTON ROAD belong to
    # the blocks "22 MELTON ROAD" / "24 MELTON ROAD".
    for house in ("22", "24"):
        flats = [f"FLAT {flat_no} {house}" for flat_no in range(1, 7)]
        in_block = survey_list[number].isin(flats) & (survey_list[street] == "MELTON ROAD")
        survey_list[street] = np.where(in_block, f"{house} MELTON ROAD", survey_list[street])

    survey_list = rename_streets(survey_list, (
        ("TURRETT GREEN COURT SILENT STREET", "TURRET GREEN COURT"),
    ))

    # Addresses that do not exist in the asset list.
    survey_list = drop_house(survey_list, "TURRET GREEN COURT", 1)
    survey_list = drop_house(survey_list, "45 RAYWELL STREET", 3)
    survey_list = drop_house(survey_list, "Avondale Drive", 40)

    # 17A BEECH ROAD has the wrong postcode in the survey sheet.
    wrong_beech_postcode = (
        (survey_list[street] == "BEECH ROAD") & (survey_list[postcode] == "DH6 1JD")
    )
    survey_list[postcode] = np.where(wrong_beech_postcode, "DH6 1JB", survey_list[postcode])

    unspaced_south_view = (
        (survey_list[street] == "SOUTHVIEW") & (survey_list[postcode] == "DL16 7DF")
    )
    survey_list[street] = np.where(unspaced_south_view, "SOUTH VIEW", survey_list[street])

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha32_survey_list(survey_list):
    """Rewrite known HA32 street names to the form used for matching.

    Only values that are *exactly* equal to a key of the mapping are
    rewritten; longer names that merely contain a key (e.g. "Coxwold Road")
    are left alone.

    Args:
        survey_list: DataFrame with a "Street / Block Name" column.  It is
            modified in place.

    Returns:
        The same DataFrame, corrected.
    """
    street = "Street / Block Name"

    exact_corrections = {
        "Coxwold": "Coxwold Grove",
        # Misspelling of Barrington Avenue.
        "Barringhton Avenue": "Barrington Avenue",
        "Rustenburg": "Rustenburg Street",
        # Both capitalisations of the Malin Lodge block occur in the sheet.
        "MALIN LODGE, RONALDSWAY CLOSE": "Malin Lodge",
        "Malin Lodge, Ronaldsway Close": "Malin Lodge",
        "Feroes Close": "Faroes Close",
        "6 Zeigfeld": "Ziegfeld Court",
        # NOTE(review): an earlier revision also mapped "FORESTER WAY" to
        # "FORESTER WAY", which changes nothing.  If a different spelling or
        # capitalisation was intended, add it here.
    }
    # Series.replace with a dict substitutes whole values, not substrings.
    survey_list[street] = survey_list[street].replace(exact_corrections)

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha50_survey_list(survey_list):
    """Correct the HA50 survey list and remove rows that cannot be matched.

    The steps are order-dependent (e.g. the CHELL HEATH postcode fix relies
    on "CHELL HEATH RD" having already been expanded) and are applied in the
    sequence written below.  All text replacements are literal
    (``regex=False``) so that patterns such as "St. James Place" never treat
    "." as a regex wildcard, whatever the pandas default for ``regex`` is.

    Args:
        survey_list: DataFrame with the columns "Street / Block Name",
            "Post Code" and "NO.".  Its "Post Code" column is modified in
            place before the first row filter.

    Returns:
        The corrected DataFrame with unwanted and duplicate rows removed.
    """
    street = "Street / Block Name"
    postcode = "Post Code"
    number = "NO."

    def rename_streets(frame, corrections):
        """Apply ordered (wrong, right) literal substring fixes to the street column."""
        names = frame[street]
        for wrong, right in corrections:
            names = names.str.replace(wrong, right, regex=False)
        frame[street] = names
        return frame

    def move_postcode(frame, street_name, wrong, right):
        """Change postcode wrong -> right on rows whose street equals street_name."""
        affected = (frame[street] == street_name) & (frame[postcode] == wrong)
        frame[postcode] = np.where(affected, right, frame[postcode])
        return frame

    def drop_address(frame, street_name, post_code, house_number):
        """Drop rows matching the given street, postcode and house number."""
        unwanted = (
            (frame[street] == street_name)
            & (frame[postcode] == post_code)
            & frame[number].isin([house_number])
        )
        return frame[~unwanted]

    survey_list = move_postcode(survey_list, "COSELEY STREET", "ST16 1LR", "ST6 1JU")
    # Surveys were done on COSELEY STREET outside of the asset list.
    survey_list = drop_address(survey_list, "COSELEY STREET", "ST6 1JU", 96)

    survey_list[postcode] = survey_list[postcode].str.replace("ST33JZ", "ST3 3JZ", regex=False)
    # Surveys were done on Jesmond Drive outside of the asset list.
    survey_list = drop_address(survey_list, "Jesmond Drive", "ST3 3JZ", 29)

    survey_list = rename_streets(survey_list, (("BRUNDELL OVAL", "BRUNDALL OVAL"),))

    survey_list = drop_address(survey_list, "Linden Place", "ST3 3AT", 4)
    survey_list = drop_address(survey_list, "Tilehurst Place", "ST3 3AP", 11)

    survey_list = rename_streets(survey_list, (
        ("deavile road", "DEAVILLE ROAD"),
        ("WOOLISCROFT ROAD", "WOOLLISCROFT ROAD"),
        ("Leak Road", "Leek Road"),
        ("Springfield road", "Springfields road"),
        ("MILLWARD RD", "MILLWARD ROAD"),
        ("REPINGTON RD", "REPINGTON ROAD"),
        ("ECCELSTONE PLACE", "ECCLESTONE PLACE"),
        ("St. James Place", "St James Place"),
        ("CHELL HEATH RD", "CHELL HEATH ROAD"),
    ))

    # Must run after "CHELL HEATH RD" has been expanded above.
    survey_list = move_postcode(survey_list, "CHELL HEATH ROAD", "ST6 6HU", "ST6 6HJ")

    survey_list = rename_streets(survey_list, (
        ("Franklin Rd", "Franklin Road"),
        ("Lodge Rd", "Lodge Road"),
        ("St Matthews Street", "St Matthew Street"),
        ("Grove Bank Road", "Grovebank Road"),
        ("OVERSLEY RD", "OVERSLEY ROAD"),
        # Catch-all for every remaining upper-case " RD".
        (" RD", " ROAD"),
        ("St. Georges Crescent", "St Georges Crescent"),
        ("Tewson Road", "Tewson Green"),
    ))

    survey_list = drop_address(survey_list, "Seabridge Lane", "ST5 4AG", 55)
    survey_list = drop_address(survey_list, "Tyne Way", "ST5 4AX", 56)

    survey_list = rename_streets(survey_list, (
        ("St.Bernards Place", "St Bernard Place"),
        ("Penarth Road", "Penarth Grove"),
        ("St. Marys Road", "St Marys Road"),
        ("Larch Drive", "Larch Grove"),
    ))

    # 31 LAUDER PLACE NORTH is listed twice; this copy also has a wrong
    # postcode, so it is the one dropped.
    survey_list = drop_address(survey_list, "LAUDER PLACE NORTH", "ST20QS", 31)

    # De-duplicate on house number plus case- and space-insensitive street
    # and postcode, keeping the first occurrence.
    # NOTE(review): the original carried the remark "Should go to 18" here,
    # presumably an expected row count after de-duplication; confirm.
    survey_list["street_pruner"] = (
        survey_list[street].str.lower().str.replace(" ", "", regex=False)
    )
    survey_list["postcode_pruner"] = (
        survey_list[postcode].str.lower().str.replace(" ", "", regex=False)
    )
    survey_list = survey_list.drop_duplicates([number, "street_pruner", "postcode_pruner"])
    survey_list = survey_list.drop(columns=["street_pruner", "postcode_pruner"])

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha107_survey_list(survey_list):
    """Fix known street-name problems in the HA107 survey list.

    Three spelling mistakes are corrected and, for eight Lincoln-area
    addresses, the comma that is missing between the street and the village
    is inserted.  Every fix is a substring replacement on
    "Street / Block Name"; the DataFrame is modified in place and returned.
    """
    street_col = "Street / Block Name"

    corrections = (
        # Spelling mistakes.
        ("Front Street, East Stockham", "Front Street, East Stockwith"),
        ("HONEYHOLE L;ANE", "HONEYHOLES LANE"),
        # Missing comma between street and village.
        ("Croft Lane Cherry Willingham, Lincoln", "Croft Lane, Cherry Willingham, Lincoln"),
        ("Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"),
        ("Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"),
        ("Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"),
        ("Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"),
        ("Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"),
        ("Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"),
        ("Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"),
        # Spelling mistake.
        ("SPRINKHILL ROAD", "SPINKHILL ROAD"),
    )

    streets = survey_list[street_col]
    for wrong, right in corrections:
        streets = streets.str.replace(wrong, right)
    survey_list[street_col] = streets

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha41_survey_list(survey_list):
    """Return the HA41 survey list unchanged; no corrections are defined for it."""
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha12_survey_list(survey_list):
    """Fix known street-name and postcode typos in the HA12 survey list.

    Six street names are corrected and two postcodes get their missing
    space inserted.  All fixes are substring replacements; the DataFrame is
    modified in place and returned.
    """
    street_col = "Street / Block Name"
    postcode_col = "Post Code"

    street_fixes = (
        ("Henstone Road", "Hanstone Road"),
        ("Lindern avenue", "Linden Avenue"),
        ("priness way", "Princess Way"),
        ("Worth Crecesent", "Worth Crescent"),
        ("Adderbrook Crescent", "Addenbrooke Crescent"),
        ("Kinver Road", "Kinver Avenue"),
    )
    postcode_fixes = (
        ("DY117HA", "DY11 7HA"),
        ("DY117HF", "DY11 7HF"),
    )

    # The two columns are independent, so each is corrected in one pass.
    for column, fixes in ((street_col, street_fixes), (postcode_col, postcode_fixes)):
        values = survey_list[column]
        for wrong, right in fixes:
            values = values.str.replace(wrong, right)
        survey_list[column] = values

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha13_survey_list(survey_list):
    """Fix known typos in the HA13 survey list and drop one duplicate row.

    All text fixes are literal substring replacements (``regex=False``).
    This matters for the postcode "HP2 5SF+": read as a regular expression
    the "+" is a quantifier, so the stray "+" would not be removed and a
    value such as "HP2 5SFF" would be altered.

    Args:
        survey_list: DataFrame with the columns "Street / Block Name",
            "Post Code", "NO." and "INSTALLED OR CANCELLED".  Its columns
            are modified in place before the row filter.

    Returns:
        The corrected DataFrame without the duplicate Turners Hill record.
    """
    street = "Street / Block Name"
    postcode = "Post Code"

    street_fixes = (
        ("Woodfarm Road", "WOOD FARM ROAD"),
        ("ALLANDALE ROAD", "ALLANDALE"),
        ("NEWFIELDS LANE", "NEWFIELD LANE"),
        ("BROADFIELDS ROAD", "BROADFIELD ROAD"),
        ("PESCOTT HILL", "PESCOT HILL"),
    )
    names = survey_list[street]
    for wrong, right in street_fixes:
        names = names.str.replace(wrong, right, regex=False)
    survey_list[street] = names

    # Remove the stray trailing "+" from this postcode.
    survey_list[postcode] = survey_list[postcode].str.replace(
        "HP2 5SF+", "HP2 5SF", regex=False
    )

    # 33 Turners Hill is recorded twice; drop the copy carrying this status.
    duplicate_record = (
        (survey_list["NO."] == 33)
        & (survey_list[street] == "Turners Hill")
        & (survey_list[postcode] == "HP2 4LH")
        & (survey_list["INSTALLED OR CANCELLED"] == "NO UPDATE - CHECKED 18.12.23")
    )
    survey_list = survey_list[~duplicate_record]

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha18_survey_list(survey_list):
    """Return the HA18 survey list unchanged; no corrections are defined for it."""
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha35_survey_list(survey_list):
    """Return the HA35 survey list unchanged; no corrections are defined for it."""
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha34_survey_list(survey_list):
    """Correct the HA34 survey list and remove rows that cannot be matched.

    The steps run in a fixed order because some depend on earlier ones
    (e.g. 32 Penlinken Drive is dropped only after "PENLINKEN DRIVE" has
    been re-cased, and de-duplication runs after all street fixes).

    Args:
        survey_list: DataFrame with the columns "Street / Block Name",
            "Post Code" and "NO.".

    Returns:
        A filtered, corrected DataFrame.
    """
    street = "Street / Block Name"
    postcode = "Post Code"
    number = "NO."

    def fix_streets(frame, corrections):
        """Apply ordered (wrong, right) substring fixes to the street column."""
        for wrong, right in corrections:
            frame[street] = frame[street].str.replace(wrong, right)
        return frame

    def drop_house(frame, street_name, house_number):
        """Drop rows whose street equals street_name and whose number equals house_number."""
        absent = (frame[street] == street_name) & (frame[number] == house_number)
        return frame[~absent]

    # Postcode L5 3SS is excluded entirely (the original remark reads "Note
    # in the asset list", presumably meaning "Not in the asset list").
    survey_list = survey_list[survey_list[postcode] != "L5 3SS"]

    survey_list[postcode] = survey_list[postcode].str.replace("L177DR", "L17 7DR")

    survey_list = fix_streets(survey_list, (
        ("PENVALLEY CRESENT", "Penvalley Crescent"),
        ("PENLINKEN DRIVE", "Penlinken Drive"),
    ))

    # Neither 32 Penlinken Drive nor 30 Gwent Street is in the asset sheet.
    survey_list = drop_house(survey_list, "Penlinken Drive", 32)
    survey_list = drop_house(survey_list, "GWENT ST", 30)

    survey_list = fix_streets(survey_list, (
        ("POULTON RD", "Poulton Road"),
        ("ST PAULS RD", "St Pauls Road"),
        ("BROAD LANE, KIRKBY", "BROAD LANE"),
        ("BULLENS RD, KIRKBY", "Bullens Road"),
    ))

    # There is no 219 NORTH HILL ST in the asset sheet.
    survey_list = drop_house(survey_list, "NORTH HILL ST", 219)

    survey_list = fix_streets(survey_list, (
        ("CROSLAND RD, KIRKBY", "CROSLAND ROAD"),
        ("PARK BROW DRIVE, KIRKBY", "Park Brow Drive"),
        ("CELTIC TREET", "Celtic Street"),
        ("BUCKLAND ROAD", "Buckland Street"),
    ))

    # Exact duplicates on street + number + postcode: keep the first.
    survey_list = survey_list.drop_duplicates([street, number, postcode])

    # A second copy of 7 CLARIBEL STREET carries a wrong postcode.
    wrong_postcode_copy = (
        (survey_list[street] == "CLARIBEL STREET")
        & (survey_list[number] == 7)
        & (survey_list[postcode] == "L8 8AF")
    )
    survey_list = survey_list[~wrong_postcode_copy]

    # Remove the space from house number "187 A" on postcode L32 6QF.
    spaced_number = (survey_list[number] == "187 A") & (survey_list[postcode] == "L32 6QF")
    survey_list[number] = np.where(spaced_number, "187A", survey_list[number])

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha56_survey_list(survey_list):
    """Correct the HA56 survey list and drop rows that cannot be matched.

    * 22 and 24 Samual Street (WA5 1BB) are dropped: not in the asset list.
    * 17 Tavlin Avenue (WA5 0EN) is dropped because two survey rows match
      the same block listing.
    * Three "... RD" street names are expanded and re-cased.

    Args:
        survey_list: DataFrame with the columns "Street / Block Name",
            "Post Code" and "NO.".

    Returns:
        A filtered, corrected DataFrame.
    """
    street_col = "Street / Block Name"
    streets = survey_list[street_col]
    postcodes = survey_list["Post Code"]
    numbers = survey_list["NO."]

    not_in_asset_list = (
        (streets == "Samual Street") & numbers.isin([22, 24]) & (postcodes == "WA5 1BB")
    )
    double_block_match = (
        (streets == "Tavlin Avenue") & (numbers == 17) & (postcodes == "WA5 0EN")
    )
    # Neither filter depends on the renames below, so both are applied up front.
    survey_list = survey_list[~(not_in_asset_list | double_block_match)]

    for abbreviated, expanded in (
        ("STOURTON RD", "Stourton Road"),
        ("BIRKIN RD", "Birkin Road"),
        ("PORTLAND RD", "Portland Road"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(abbreviated, expanded)

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha30_survey_list(survey_list):
    """
    Apply the hand-curated HA30 corrections to the survey list.

    Rows without a postcode are removed, street names are cut at the first "/", and then a fixed
    sequence of row removals and street renames is applied. The order of the steps matters: some
    removals target a street name that only exists after the preceding rename.

    :param survey_list: DataFrame with "Street / Block Name", "NO." and "Post Code" columns.
    :return: The corrected survey DataFrame.
    """
    street_col = "Street / Block Name"

    def drop_house(frame, street, number):
        # Remove the row(s) with exactly this street name and house number
        return frame[~((frame[street_col] == street) & (frame["NO."] == number))]

    def drop_houses(frame, street, numbers):
        # Remove the rows on this street whose house number is one of `numbers`
        return frame[~((frame[street_col] == street) & (frame["NO."].isin(numbers)))]

    def rename_street(frame, old, new):
        # Substring replacement on the street column
        frame[street_col] = frame[street_col].str.replace(old, new)
        return frame

    survey_list = survey_list[~pd.isnull(survey_list["Post Code"])]

    # Split on / and take the first half
    survey_list[street_col] = survey_list[street_col].str.split("/").str[0]

    # Not in the asset list
    for street, number in (
        ("Horsebridge Road", 286),
        ("DUTTON WAY", 9),
        ("PAYTHORNE CLOSE", 10),
        ("MARCHWOOD ROAD", 11),
        ("Otterburn Close", 4),
        ("Blossom Court", 5),
    ):
        survey_list = drop_house(survey_list, street, number)

    survey_list = rename_street(survey_list, "St LUKES CLOSE , HUNTINGDON", "St. Lukes Close")
    survey_list = drop_houses(survey_list, "St. Lukes Close", [4, 7, 8])

    survey_list = rename_street(survey_list, "ROMAN WAY , GODMANCHESTER , HUNTINGDON", "Roman Way")
    survey_list = drop_houses(survey_list, "Roman Way", [58])

    survey_list = rename_street(survey_list, "HEADLANDS , FENSTANTON , HUNTINGDON", "Headlands Fenstanton")
    survey_list = drop_houses(survey_list, "Headlands Fenstanton", [126, 134])

    for old, new in (
        ("WALLACE COURT , HUNTINGDON", "Wallace Court"),
        ("CRICKETERS WAY , CHATTERIS", "Cricketers Way"),
        ("Jubilee Gardens", "Jubilee Green"),
    ):
        survey_list = rename_street(survey_list, old, new)

    survey_list = drop_houses(survey_list, "Harrow Road", [10])

    # This rename runs after the "St. Lukes Close" removal above, so rows spelled
    # "ST LUKES CLOSE" are renamed but never dropped.
    survey_list = rename_street(survey_list, "ST LUKES CLOSE", "St. Lukes Close")

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha49_survey_list(survey_list):
    """
    Correction hook for the HA49 survey list; no corrections are applied.

    :param survey_list: The HA49 survey DataFrame.
    :return: The same object, untouched.
    """
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha8_survey_list(survey_list):
    """
    Apply the hand-curated HA8 corrections to the survey list.

    :param survey_list: DataFrame with "Street / Block Name" and "Post Code" columns.
    :return: The corrected survey DataFrame, without the rows that have no postcode.
    """
    street_col = "Street / Block Name"

    # Split on / and take the first half
    survey_list[street_col] = survey_list[street_col].str.split("/").str[0]

    # Street name rewrites, applied one after another in this order
    for wrong, right in (
        ("WESTONIA COURT HOUSE", "Westonia Court"),
        ("Hillesdon Avenue", "Hillesden Avenue"),
        ("Weston Street", "Western Street"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

    # Remove placeholder rows where postcode is missing
    has_postcode = ~pd.isnull(survey_list["Post Code"])
    return survey_list[has_postcode]
|
|
|
|
@staticmethod
|
|
def correct_ha11_survey_list(survey_list):
    """
    Apply the hand-curated HA11 corrections to the survey list.

    :param survey_list: DataFrame with "Street / Block Name" and "NO." columns.
    :return: The survey DataFrame without 39 HOLLYWOOD WAY.
    """
    # Remove 39 HOLLYWOOD WAY as it's not in the asset list
    on_hollywood_way = survey_list["Street / Block Name"] == "HOLLYWOOD WAY"
    is_number_39 = survey_list["NO."] == 39
    return survey_list[~(on_hollywood_way & is_number_39)]
|
|
|
|
@staticmethod
|
|
def correct_ha42_survey_list(survey_list):
    """
    Apply the hand-curated HA42 corrections to the survey list.

    Two street names are blanked out of the "Street / Block Name" column (the input frame is
    modified in place and returned).

    :param survey_list: DataFrame with a "Street / Block Name" column.
    :return: The same DataFrame with the two street names removed from the column values.
    """
    street_col = "Street / Block Name"

    # original asset list has nothing in the street
    for street_to_blank in ("Turnstone Terrace", "Pegasus place"):
        survey_list[street_col] = survey_list[street_col].str.replace(street_to_blank, "")

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha45_survey_list(survey_list):
    """
    Apply the hand-curated HA45 corrections to the survey list.

    :param survey_list: DataFrame with a "Street / Block Name" column (modified in place).
    :return: The same DataFrame with "Norwich Road" rewritten as "Norwich Avenue".
    """
    street_col = "Street / Block Name"
    corrected_streets = survey_list[street_col].str.replace("Norwich Road", "Norwich Avenue")
    survey_list[street_col] = corrected_streets
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha51_survey_list(survey_list):
    """
    Apply the hand-curated HA51 corrections to the survey list.

    The house number column arrives as "NO " (trailing space) and is renamed to "NO.";
    the misspelt "Autum Close" is rewritten as "Autumn Close".

    :param survey_list: DataFrame with "NO " and "Street / Block Name" columns.
    :return: A renamed and corrected DataFrame.
    """
    street_col = "Street / Block Name"

    corrected = survey_list.rename(columns={"NO ": "NO."})
    corrected[street_col] = corrected[street_col].str.replace("Autum Close", "Autumn Close")

    return corrected
|
|
|
|
@staticmethod
|
|
def correct_ha52_survey_list(survey_list):
    """
    Apply the hand-curated HA52 corrections to the survey list.

    :param survey_list: DataFrame with "NO.", "Street / Block Name" and "Post Code" columns.
    :return: The corrected DataFrame with duplicate (number, street, postcode) rows removed.
    """
    street_col = "Street / Block Name"

    # Street name rewrites, applied one after another in this order
    for wrong, right in (
        ("Mardalle Avenue", "Mardale Avenue"),
        ("Ollerton Close, Grappenhall", "Ollerton Close"),
        ("Bradshaw Road, Grappenhall", "Bradshaw Lane"),
    ):
        survey_list[street_col] = survey_list[street_col].str.replace(wrong, right)

    # Drop a bunch of dupes
    identity_columns = ["NO.", street_col, "Post Code"]
    return survey_list.drop_duplicates(subset=identity_columns)
|
|
|
|
@staticmethod
|
|
def correct_ha5_survey_list(survey_list):
    """
    Correction hook for the HA5 survey list; no corrections are applied.

    :param survey_list: The HA5 survey DataFrame.
    :return: The same object, untouched.
    """
    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha20_survey_list(survey_list):
    """
    Apply the hand-curated HA20 corrections to the survey list.

    Six street names are rewritten; the input frame is modified in place and returned.

    :param survey_list: DataFrame with a "Street / Block Name" column.
    :return: The same DataFrame with the street names rewritten.
    """
    street_col = "Street / Block Name"

    # These survey spellings are not in the asset list; rewrite them, in this order
    street_rewrites = (
        ("Abbot Close", "ABBOTS CLOSE"),
        ("Downbarns Road", "DOWN BARNS ROAD"),
        ("Austin Lane", "AUSTINS LANE"),
        ("South Park Way", "SOUTHPARK WAY"),
        ("OAKLAND ROAD", "OAKWOOD ROAD"),
        ("ACRE WAY/NORTHWOOD", "ACRE WAY"),
    )
    for survey_spelling, replacement in street_rewrites:
        survey_list[street_col] = survey_list[street_col].str.replace(survey_spelling, replacement)

    return survey_list
|
|
|
|
@staticmethod
|
|
def levenstein_match(matching_string, df):
    """
    Pick the single row of ``df`` whose "matching_address" is closest to ``matching_string``.

    Each candidate address has its punctuation and spaces stripped before it is scored with
    ``fuzz.ratio``; ``100 - ratio`` is used as the distance and the first row with the smallest
    distance wins.

    :param matching_string: The key to match, expected to already have its spaces removed.
    :param df: Candidate rows; must have a "matching_address" column of strings.
    :return: A one-row DataFrame holding the best match, or ``df`` itself when it is empty.
    """
    # Nothing to score: hand the empty frame back so the caller's own "exactly one row" check
    # reports the failure, instead of min() raising on an empty list below.
    if df.empty:
        return df

    # Strip out punctuation and spaces
    candidates = [
        re.sub(r'[^\w\s]', '', address).replace(" ", "")
        for address in df["matching_address"].tolist()
    ]

    # Perform matching between full key and the candidates
    distances = [100 - fuzz.ratio(matching_string, candidate) for candidate in candidates]
    best_match_index = distances.index(min(distances))

    # We might want to consider a threshold for the distance, however we don't for the moment,
    # so the closest candidate is returned however poor the match is
    return df.iloc[best_match_index:best_match_index + 1]
|
|
|
|
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
    """
    Match every survey row to one asset-list row and attach the matched ``asset_list_row_id``.

    The survey list is first passed through the HA-specific ``correct_<ha_name>_survey_list`` hook.
    Each row is then narrowed down against ``asset_list`` by street name, house number, ``HouseNo``,
    postcode and, as a last resort, fuzzy matching on the full address key.

    :param asset_list: DataFrame with "matching_address", "matching_postcode", "HouseNo" and
        "asset_list_row_id" columns.
    :param survey_list: DataFrame which, after correction, has "NO.", "Street / Block Name",
        "Post Code" and "survey_list_row_id" columns (and optionally "Town/Area").
    :param ha_name: Housing association identifier, e.g. "HA56"; selects the correction hook and the
        postcodes that are allowed to stay unmatched.
    :return: The corrected survey list left-merged with its "asset_list_row_id" (missing where the
        row's postcode is a tolerated miss).
    :raises ValueError: If a row cannot be resolved to exactly one asset, if the lookup does not
        cover every survey row, or if two survey rows resolve to the same asset.
    """
    # Correct the survey list
    survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list")
    survey_list = survey_list_correction_function(survey_list)

    # Lower-cased postcodes for which an unmatched survey row is tolerated
    missed_postcodes = []
    if ha_name in ["HA6", "HA34"]:
        missed_postcodes = [
            postcode.lower() for postcode in survey_list["Post Code"] if
            postcode.lower() not in asset_list["matching_postcode"].values
        ]

    if ha_name == "HA13":
        missed_postcodes = ["hp17 8le"]

    if ha_name == "HA56":
        # Multiple properties are listed as blocks, which is a problem for matching
        missed_postcodes = ["sk17 6nr", "wa5 0en"]

    matching_lookup = []
    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):

        house_number = row["NO."]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        # Filter on the first line of the address
        df = asset_list[
            asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip())
        ].copy()

        if not any(df["matching_address"].str.contains(str(house_number))):
            if "flat" in str(house_number):
                house_number = house_number.split("flat")[1].strip()

            # We check if we had an instance of flat x, y
            if "," in str(house_number):
                house_number = house_number.split(",")[0].strip()

            # We may also have a space for an instance of flat x y
            if " " in str(house_number):
                house_number = house_number.split(" ")[0].strip()

        df = df[df["matching_address"].str.contains(str(house_number))]

        if df.empty:

            postcode_lower = row["Post Code"].lower()
            if postcode_lower in missed_postcodes:
                matching_lookup.append(
                    {
                        "survey_list_row_id": row["survey_list_row_id"],
                        "asset_list_row_id": None,
                    }
                )
                continue

            print(row["Street / Block Name"])
            print(house_number)
            print(row["Post Code"])
            raise ValueError("Investigate")

        if df.shape[0] != 1:
            df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
            if df.shape[0] != 1:
                df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]

            if df.empty:

                postcode_lower = row["Post Code"].lower()
                if postcode_lower in missed_postcodes:
                    matching_lookup.append(
                        {
                            "survey_list_row_id": row["survey_list_row_id"],
                            "asset_list_row_id": None,
                        }
                    )
                    continue

            # Only fuzzy-match when there are several candidates left. An empty frame is left
            # for the check below, which raises with the row details printed.
            if df.shape[0] > 1:
                if "Town/Area" not in row.keys():
                    full_key = (str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() +
                                row["Post Code"].lower().strip())
                else:
                    full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + \
                               row["Town/Area"].lower().strip() + row["Post Code"].lower().strip()
                # Remove any spaces from the full key
                full_key = full_key.replace(" ", "")

                df = self.levenstein_match(full_key, df)

        if df.shape[0] != 1:
            print(row["Street / Block Name"])
            print(house_number)
            print(row["Post Code"])
            raise ValueError("Investigate")

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    # Name the columns explicitly so an empty survey list still yields a usable (empty) lookup
    matching_lookup = pd.DataFrame(matching_lookup, columns=["survey_list_row_id", "asset_list_row_id"])

    if matching_lookup.shape[0] != survey_list.shape[0]:
        raise ValueError("Mismatch in the number of survey rows and matching lookup rows")

    # Tolerated misses carry no asset id; leave them out of the lookup before checking for dupes
    matching_lookup = matching_lookup[~pd.isnull(matching_lookup["asset_list_row_id"])]

    if matching_lookup["asset_list_row_id"].duplicated().sum():
        raise ValueError("Duplicated matches in survey list")

    # Merge onto the survey list
    survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")

    return survey_list
|
|
|
|
@staticmethod
|
|
def correct_ha25_eco3_list(eco3_list):
    """
    Apply the hand-curated HA25 corrections to the ECO3 list.

    :param eco3_list: DataFrame with "NO ", "Street / Block Name" and "Post Code" columns.
    :return: A corrected copy: unmatched and postcode-less rows removed, duplicates dropped, street
        names rewritten and the EDMUND ROAD postcode fixed.
    """
    street_col = "Street / Block Name"

    # NEADS DRIVE, postcode with bs305dt, is not found in the asset list
    eco3_list = eco3_list[~(eco3_list["Post Code"] == "BS305DT")]
    # Drop rows with missing postcode
    eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])]
    # We have a bunch of genuine duplicates. Copy so the column assignments below write to our
    # own frame rather than to a slice of the caller's.
    eco3_list = eco3_list.drop_duplicates(["NO ", street_col, "Post Code"]).copy()

    # Literal (non-regex) rewrites: "ST.MARYS HILL" contains a ".", which would act as a wildcard
    # on pandas versions where str.replace defaults to regex=True.
    for wrong, right in (
        ("HALWILL MEADOOW", "HALWILL MEADOW"),
        ("Hall Road", "Hall Rd"),
        ("SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY"),
        ("BOND SPEAR COURT", "BOND-SPEAR COURT"),
        ("ST.MARYS HILL", "ST MARYS HILL"),
    ):
        eco3_list[street_col] = eco3_list[street_col].str.replace(wrong, right, regex=False)

    # Correct the postcode for edmund road
    eco3_list["Post Code"] = np.where(
        (eco3_list[street_col] == "EDMUND ROAD") &
        (eco3_list["Post Code"] == "TR14 8QJ"),
        "TR15 1BY",
        eco3_list["Post Code"]
    )
    return eco3_list
|
|
|
|
@staticmethod
|
|
def correct_ha50_eco3_list(eco3_list):
    """
    Correction hook for the HA50 ECO3 list; no corrections are applied.

    :param eco3_list: The HA50 ECO3 DataFrame.
    :return: The same object, untouched.
    """
    return eco3_list
|
|
|
|
@staticmethod
|
|
def correct_ha41_eco3_list(eco3_list):
    """
    Correction hook for the HA41 ECO3 list; no corrections are applied.

    :param eco3_list: The HA41 ECO3 DataFrame.
    :return: The same object, untouched.
    """
    return eco3_list
|
|
|
|
@staticmethod
|
|
def correct_ha63_eco3_list(eco3_list):
    """
    Apply the hand-curated HA63 corrections to the ECO3 list.

    :param eco3_list: DataFrame with "NO ", "Street / Block Name" and "Post Code" columns.
    :return: The corrected DataFrame, without postcode-less rows and two excluded postcodes.
    """
    street_col = "Street / Block Name"
    number_col = "NO "

    def rename_streets(frame, rewrites):
        # Apply each (old, new) substring replacement to the street column, in order
        for old, new in rewrites:
            frame[street_col] = frame[street_col].str.replace(old, new)
        return frame

    has_postcode = ~pd.isnull(eco3_list["Post Code"])
    eco3_list = eco3_list[has_postcode]

    # Some postcode that aren't in the asset list
    not_in_assets = eco3_list["Post Code"].isin(["NR32 15X", "NR30 2BT"])
    eco3_list = eco3_list[~not_in_assets]

    eco3_list = rename_streets(eco3_list, (
        ("POUND COTTAGES - BLOOMSBERRY CLOSE", "POUND COTTAGES"),
        ("FREDRICK ROAD", "Frederick Road"),
    ))

    # For denmark street, remove the space from the house number
    # NOTE(review): .str.replace yields NaN for non-string house numbers, so this presumably
    # relies on the DENMARK STREET numbers being strings - confirm against the data.
    on_denmark_street = eco3_list[street_col] == "DENMARK STREET"
    eco3_list[number_col] = np.where(
        on_denmark_street,
        eco3_list[number_col].str.replace(" ", ""),
        eco3_list[number_col]
    )

    eco3_list = rename_streets(eco3_list, (
        ("OLD HOSPITAL MEWS HOSPITAL WALK", "Old Hospital Mews"),
        ("Portland House, Portland Street", "Portland House"),
        ("MIDDLE MARKET STREET", "Middle Market Road"),
    ))

    return eco3_list
|
|
|
|
@staticmethod
|
|
def correct_ha117_eco3_list(eco3_list):
    """
    Apply the hand-curated HA117 corrections to the ECO3 list.

    :param eco3_list: DataFrame with "Street / Block Name" and "Post Code" columns.
    :return: The DataFrame without postcode-less rows and with "TARRING ROAD" rewritten as
        "155 TARRING ROAD".
    """
    street_col = "Street / Block Name"

    # Delete rows where postcode is null - there are some placeholder rows where this happens
    has_postcode = ~pd.isnull(eco3_list["Post Code"])
    eco3_list = eco3_list[has_postcode]

    prefixed_streets = eco3_list[street_col].str.replace("TARRING ROAD", "155 TARRING ROAD")
    eco3_list[street_col] = prefixed_streets

    return eco3_list
|
|
|
|
@staticmethod
|
|
def correct_ha56_eco3_list(eco3_list):
    """
    Apply the hand-curated HA56 corrections to the ECO3 list.

    :param eco3_list: DataFrame with "NO ", "Street / Block Name", "Post Code" and
        "INSTALL/ CANCELLATION DATE" columns.
    :return: The corrected DataFrame.
    """
    street_col = "Street / Block Name"

    has_postcode = ~pd.isnull(eco3_list["Post Code"])
    eco3_list = eco3_list[has_postcode]

    # Street name rewrites, applied one after another in this order
    for old, new in (
        ("Mount Pleasant, Crewe", "Mount Pleasant"),
        ("Dutton Close", "Dutton Way"),
    ):
        eco3_list[street_col] = eco3_list[street_col].str.replace(old, new)

    # One postcode was entered without its space and in the wrong case
    eco3_list["Post Code"] = eco3_list["Post Code"].str.replace("Ls63nl", "LS6 3NL")

    # Handle a duplicate: drop the cancelled entry for 5 Mount Pleasant
    cancelled_duplicate = (
        (eco3_list[street_col] == "Mount Pleasant")
        & (eco3_list["Post Code"] == "CW1 3JF")
        & (eco3_list["NO "] == 5)
        & (eco3_list["INSTALL/ CANCELLATION DATE"] == "CANCELLED 20.5.2022")
    )
    return eco3_list[~cancelled_duplicate]
|
|
|
|
@staticmethod
|
|
def correct_ha51_eco3_list(eco3_list):
    """
    Apply the hand-curated HA51 corrections to the ECO3 list.

    :param eco3_list: DataFrame with "NO ", "Street / Block Name" and "Post Code" columns.
    :return: The corrected DataFrame.
    """
    street_col = "Street / Block Name"
    number_col = "NO "

    # Misspelt street names, rewritten one after another in this order
    for misspelt, corrected in (
        ("HASELEMERE AVENUE", "HASLEMERE AVENUE"),
        ("THORVILLE GROVE", "THORNVILLE GROVE"),
        ("MONTBRETA CLOSE", "MONTBRETIA CLOSE"),
    ):
        eco3_list[street_col] = eco3_list[street_col].str.replace(misspelt, corrected)

    # Swap the postcode recorded for SYDENHAM ROAD
    wrong_sydenham_postcode = (
        (eco3_list[street_col] == "SYDENHAM ROAD")
        & (eco3_list["Post Code"] == "CR0 2DW")
    )
    eco3_list["Post Code"] = np.where(wrong_sydenham_postcode, "CR0 2ED", eco3_list["Post Code"])

    # Not in asset list
    absent_from_assets = (
        (eco3_list[street_col] == "WOODLEY LANE")
        & (eco3_list["Post Code"] == "SM5 2RJ")
        & (eco3_list[number_col] == "FLAT 3, 11")
    )
    eco3_list = eco3_list[~absent_from_assets]

    # Close the gap in the house number "47 B"
    spaced_number = eco3_list[number_col] == "47 B"
    eco3_list[number_col] = np.where(spaced_number, "47B", eco3_list[number_col])

    return eco3_list
|
|
|
|
def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
    """
    Match ECO3 rows to asset-list rows and attach the matched ``asset_list_row_id``.

    The ECO3 list is first passed through the HA-specific ``correct_<ha_name>_eco3_list`` hook. Rows
    are then matched on postcode (spaces removed), house number and street name. Rows that cannot be
    matched are counted, and the count must equal ``self.UNMATCHED_ECO3[ha_name]``.

    Side effects on ``asset_list``: a "matching_postcode_nospace" column is added and left in place;
    a temporary "matching_address_no_punctuation" column is added and dropped again on success.

    :param asset_list: DataFrame with "matching_address", "matching_postcode", "HouseNo" and
        "asset_list_row_id" columns.
    :param eco3_list: DataFrame with "NO ", "Street / Block Name", "Post Code" and
        "eco3_list_row_id" columns.
    :param ha_name: Housing association identifier, e.g. "HA25".
    :return: The corrected ECO3 list left-merged with its "asset_list_row_id".
    :raises ValueError: If a row matches several assets, if the number of unmatched rows differs
        from the expected figure, or if two ECO3 rows resolve to the same asset.
    """
    eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list")
    eco3_list = eco3_list_correction_function(eco3_list)

    asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower()
    eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "")

    if ha_name in ["HA25", "HA56", "HA51"]:
        # Drop the ECO3 rows whose postcode does not appear in the asset list at all
        # HA25: 317 -> 259
        known_postcodes = asset_list["matching_postcode_nospace"].values
        missed_postcodes = {
            postcode for postcode in eco3_list["postcode_no_space"] if
            postcode not in known_postcodes
        }

        eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)]

    # For the asset list, we create a matching address without any punctuation
    # TODO: We should generally just remove punctuation from addresses when matching
    asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(
        r'[^\w\s]', '', regex=True
    )
    # Remove double spaces (stripping punctuation can leave two spaces side by side)
    asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace(
        "  ", " "
    )

    matching_lookup = []
    missed = []
    for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
        postcode = row["postcode_no_space"]

        # For the HAs filtered above, df will never be empty here, since we've already done a
        # check for common postcodes
        df = asset_list[
            asset_list["matching_postcode_nospace"].str.contains(postcode)
        ]

        house_number = row["NO "]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        if not any(df["HouseNo"].str.contains(str(house_number))):
            if "flat" in str(house_number):
                house_number = house_number.split("flat")[1].strip()

            # We check if we had an instance of flat x, y
            if "," in str(house_number):
                house_number = house_number.split(",")[0].strip()

            # We may also have a space for an instance of flat x y
            if " " in str(house_number):
                house_number = house_number.split(" ")[0].strip()

        # We must do the house number filter
        df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]

        # Perform a search on streetname
        # We do this to prevent duplicate matches to properties with the same postcode and house number,
        # but different streets
        street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
        street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1)
        df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)]

        if df.empty:
            missed.append(row["eco3_list_row_id"])
            continue

        if df.shape[0] > 1:
            # Several candidates: keep only those that agree with the row on being a flat
            if "flat" in str(row["NO "]).lower():
                df = df[df["matching_address"].str.contains("flat")]
            else:
                df = df[~df["matching_address"].str.contains("flat")]

        if df.shape[0] != 1:
            print(row["Street / Block Name"])
            print(house_number)
            print(row["Post Code"])
            raise ValueError("Investigate")

        matching_lookup.append(
            {
                "eco3_list_row_id": row["eco3_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    # We verify the missed
    # HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2
    # where many surveys were conducted on house numbers, not in the asset list
    # 154 missed, 2827 matched for HA 25
    # For HA56, the number of missed is high at 320, however a big portion of these are due to the block being
    # listed in the asset list, and individual units being in the survey list
    if len(missed) != self.UNMATCHED_ECO3[ha_name]:
        raise ValueError(
            f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
        )

    # Name the columns explicitly so that "no matches at all" yields an empty lookup that can still
    # be checked and merged, instead of a column-less frame
    matching_lookup = pd.DataFrame(matching_lookup, columns=["eco3_list_row_id", "asset_list_row_id"])
    # Check dupes as this will cause problems later on
    if matching_lookup["asset_list_row_id"].duplicated().sum():
        raise ValueError("Duplicated asset list row ids")

    # Merge onto eco3 list
    eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id")

    asset_list.drop(columns=["matching_address_no_punctuation"], inplace=True)

    return eco3_list
|
|
|
|
@staticmethod
|
|
def extract_streetname(address, house_number=None, postcode=None):
    """
    Cleans an address by removing the house number and postcode, and converts everything to lower case.
    Only the part before the first remaining comma is returned.

    :param address: The full address as a string.
    :param house_number: The house number to remove, as a string or integer.
    :param postcode: The postcode to remove, as a string.
    :return: The cleaned address.
    """
    # Convert everything to lower case
    address = address.lower()

    if house_number is not None:
        # Remove the house number. It is escaped so that characters such as "(" or "[" are taken
        # literally, and bounded with look-arounds instead of \b so that numbers which start or end
        # with punctuation can still be matched as whole tokens.
        house_number_pattern = r'(?<!\w){}(?!\w)'.format(re.escape(str(house_number)))
        address = re.sub(house_number_pattern, '', address, flags=re.IGNORECASE).strip()

    if postcode is not None:
        # Remove the postcode
        address = re.sub(r'\b{}\b'.format(re.escape(postcode)), '', address, flags=re.IGNORECASE).strip()

    # Get first section before a comma
    address = address.split(",")[0]
    # Replace multiple spaces with a single space (no commas are left to tidy up after the split)
    address = re.sub(r'\s+', ' ', address)

    return address
|
|
|
|
def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
    """
    Match CIGA rows to asset-list rows and attach the matched ``asset_list_row_id``.

    Each row is matched on postcode and house number, then narrowed by street name, by whether the
    address is a flat and, as a last resort, by fuzzy matching on the full address key. Rows with no
    postcode/house number match are skipped and counted; the count must equal
    ``self.UNMATCHED_CIGA[ha_name]``.

    :param asset_list: DataFrame with "matching_address", "HouseNo" and "asset_list_row_id" columns.
    :param ciga_list: DataFrame with "HouseNo", "Matched Address", "Matched Postcode" and
        "ciga_list_row_id" columns.
    :param ha_name: Housing association identifier, used to look up the expected unmatched count.
    :return: The CIGA list left-merged with its "asset_list_row_id".
    :raises ValueError: If a row cannot be narrowed down to one asset, if the number of unmatched
        rows differs from the expected figure, or if two CIGA rows resolve to the same asset.
    """
    matching_lookup = []
    unmatched_addresses = []

    for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):

        house_number = row["HouseNo"]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        # Filter on the postcode
        df = asset_list[
            asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
        ].copy()

        df = df[df["HouseNo"].astype(str) == str(house_number)]
        # For ciga, we skip
        if df.empty:
            unmatched_addresses.append(
                {
                    "ciga_list_row_id": row["ciga_list_row_id"],
                    "HouseNo": house_number,
                    "Matched Postcode": row["Matched Postcode"]
                }
            )
            continue

        if df.shape[0] != 1:

            # We split house number and postcode out of the matched address for ciga
            street_name = self.extract_streetname(
                address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
            )
            # We check if any of the rows contains the street name and if they do, filter
            street_hits = df["matching_address"].str.replace(",", "").str.contains(street_name)
            if any(street_hits):
                df = df[street_hits]

        if df.shape[0] != 1:
            # The final check we do here is to check for the presence of flat in the address
            if "flat" in row["Matched Address"].lower():
                df = df[df["matching_address"].str.contains("flat")]
            else:
                df = df[df["matching_address"].str.contains("flat") == False]

        # Only fuzzy-match when several candidates are left. If the flat filter removed them all,
        # fall through to the check below so the failing row is reported.
        if df.shape[0] > 1:
            full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[
                "Matched Postcode"].lower().strip()
            # Remove any spaces from the full key
            full_key = full_key.replace(" ", "")
            df = self.levenstein_match(full_key, df)

        if df.shape[0] != 1:
            # Report the same CIGA columns this method reads, not the survey-list column names
            print(row["Matched Address"])
            print(house_number)
            print(row["Matched Postcode"].lower())
            raise ValueError("Investigate")

        matching_lookup.append(
            {
                "ciga_list_row_id": row["ciga_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    # We have an acceptable number of ciga failures for each HA
    if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
        raise ValueError(
            f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched")

    # Name the columns explicitly so that "no matches at all" yields an empty lookup that can still
    # be checked and merged, instead of a column-less frame
    matching_lookup = pd.DataFrame(matching_lookup, columns=["ciga_list_row_id", "asset_list_row_id"])

    # Check dupes as this will cause problems later on
    if matching_lookup["asset_list_row_id"].duplicated().any():
        raise ValueError("Duplicated asset list row ids")

    # Merge onto the ciga list
    ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id")

    return ciga_list
|
|
|
|
@staticmethod
|
|
def identify_built_form_ha6(property_string):
    """
    Identify the built form of a property from the given string.

    Matching is case-insensitive and treats hyphens like spaces, so "Semi-Detached" and
    "semi detached" are recognised alike.

    :param property_string: The string describing the property
    :return: The identified built form, or None if it cannot be identified (including when the
        input is not a string, e.g. None or NaN)
    """
    if not isinstance(property_string, str):
        return None

    # Keywords for each built form. Order matters: 'semi detached' must be tested before
    # 'detached', because the latter is a substring of the former.
    built_forms = (
        ('Semi-Detached', ('semi detached',)),
        ('Detached', ('detached',)),
        ('Mid-Terrace', ('mid terrace', 'mid town house')),
        ('End-Terrace', ('end terrace', 'end town house')),
    )

    # Normalize the input: lower case, hyphens as spaces, runs of whitespace collapsed. Without
    # the hyphen handling "semi-detached" would only hit the 'detached' keyword.
    property_string_normalized = " ".join(property_string.lower().replace("-", " ").split())

    # Return the first built form with a keyword present in the input string
    for built_form, keywords in built_forms:
        if any(keyword in property_string_normalized for keyword in keywords):
            return built_form

    # Return None if no built form is identified
    return None
|
|
|
|
def load(self):
    """
    Load the December figures and the per-HA input lists, then cache the inputs in S3.

    Sets ``self.december_figures`` (from the CSV at ``self.december_figures_filepath``) and
    ``self.data``, a dict keyed by HA name whose values hold the "asset_list", "survey_list",
    "ciga_list" and "eco3_list" returned by ``self.load_asset_list``. When ``self.use_cache`` is set
    and ``self.rebuild`` is not, the previously pickled inputs are read from S3 first and only the
    HAs missing from them are loaded. The resulting dict is always written back to S3.
    """
    cache_location = {
        "bucket_name": "retrofit-datalake-dev",
        "s3_file_name": "ha-analysis/batch3-inputs.pickle",
    }

    # Get the december figures, which is just a csv
    figures = pd.read_csv(self.december_figures_filepath)
    self.december_figures = figures
    # Remove the spaces in HA Name
    figures["HA Name"] = figures["HA Name"].str.replace(" ", "")
    # Cast the count columns to the Int64 dtype
    for count_column in ("ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"):
        figures[count_column] = figures[count_column].astype("Int64")

    # Start from the cached inputs when allowed, otherwise from scratch
    data = read_pickle_from_s3(**cache_location) if (self.use_cache and not self.rebuild) else {}

    for filepath in self.directories:
        ha_name = filepath.split("/")[2]
        if ha_name not in data:
            # Load asset list
            logger.info("Loading data for {}".format(ha_name))
            asset_list, survey_list, ciga_list, eco3_list = self.load_asset_list(
                filepath=filepath,
                ha_name=ha_name,
            )
            data[ha_name] = dict(
                asset_list=asset_list,
                survey_list=survey_list,
                ciga_list=ciga_list,
                eco3_list=eco3_list,
            )

    self.data = data

    # Cache the data in s3
    # We need to pickle the data and store in s3
    save_pickle_to_s3(data=self.data, **cache_location)
|
|
|
|
def ha_facts_and_figures(self):
    """
    Build the per-HA facts and figures table and store it on ``self.facts_and_figures``.

    For every HA in ``self.data`` the asset list's "ECO Eligibility" column is normalised
    (surplus whitespace, case, known misspellings) and then refined using the CIGA list,
    the ECO3 list and the survey list. The survey list gains an ``installation_status``
    column of the form "<scheme> - <status>". The updated asset list and survey list are
    written back into ``self.data[ha_name]``.

    The counts of every eligibility value and every installation status are collected per
    HA, the "not eligible" count is removed, and the result is inner-joined onto
    ``self.december_figures`` by "HA Name".

    :return: None. The resulting DataFrame is stored in ``self.facts_and_figures``.
    :raises ValueError: if merging the CIGA list or the ECO3 list changes the number of
        rows in the asset list.
    """

    scheme_map = {
        "ECO4": "ECO4",
        "AFFORDABLE WARMTH": "ECO4",
        "ECO4 A/W": "ECO4",
        "ECO4 GBIS (ECO+)": "GBIS",
        "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS",
        "ECO4 AFFORDABLE WARMTH": "ECO4",
        "Affordable Warmth": "ECO4",
        "ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS",
        "ECO4 PPS": "ECO4",
        "AFFORDABLE WARMTH / REMEDIAL": "ECO4",
        "AFF0RDALE WARMTH": "ECO4",
        "ECO 4 RdSAP CL": "ECO4",
        "Affordable Warmth (R) ": "ECO4",
        "Affordable Warmth ": "ECO4",
        "ECO 4 AFFORDABLE WARMTH": "ECO4",
    }

    # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we
    # treat these as similar to subject to CIGA, and therefore unconfirmed work that could fail. There
    # are only a small volume of properties for which we see this
    eco_eligibility_map = {
        "not eligble": "not eligible",
        "eco 4(subject to ciga)": "eco4 (subject to ciga)",
        "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga) (subject to archetype)",
        "eco4 (subject to archetype check)": "eco4 (subject to archetype)",
        "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)",
        "eco4 (subject to ciga)": "eco4 (subject to ciga)",
        "eco4(subject to ciga)": "eco4 (subject to ciga)",
        "eco4 subject to ciga": "eco4 (subject to ciga)",
        "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)",
        "eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)",
        "eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)",
    }

    # Installation statuses as they appear once the scheme name has been prefixed
    eco4_statuses = ["ECO4 - installed", "ECO4 - cancelled", "ECO4 - in progress"]
    gbis_statuses = ["GBIS - installed", "GBIS - cancelled", "GBIS - in progress"]

    ha_facts_and_figures = []
    for ha_name, data_assets in self.data.items():
        asset_list = data_assets["asset_list"].copy()
        survey_list = data_assets["survey_list"].copy()
        ciga_list = data_assets["ciga_list"].copy()
        eco3_list = data_assets.get("eco3_list", pd.DataFrame())

        asset_list_starting_size = asset_list.shape[0]

        # The eligibility column is spelt/cased differently across asset lists
        asset_list = asset_list.rename(
            columns={
                "ECO eligibility": "ECO Eligibility",
                "ECO Eligibilty": "ECO Eligibility",
            },
        )
        # Remove surplus whitespace, push to lower case, then remap the known variants
        asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip()
        asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower()
        asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].replace(eco_eligibility_map)

        if not ciga_list.empty:
            # We merge on ciga and update the status to reflect if it has failed ciga or not.
            # If Guarantee is Yes, this means that there is a guarantee in place, and the property
            # failed the CIGA check
            ciga_list_to_merge = ciga_list[["asset_list_row_id", "Guarantee"]].copy()
            ciga_list_to_merge = ciga_list_to_merge[~pd.isnull(ciga_list_to_merge["asset_list_row_id"])]

            asset_list = asset_list.merge(ciga_list_to_merge, how='left', on="asset_list_row_id")

            # "Yes" -> failed the check; any remaining "subject to ciga" with "No" -> passed
            for guarantee, outcome in (("Yes", "failed ciga"), ("No", "eco4 - passed ciga")):
                asset_list["ECO Eligibility"] = np.where(
                    (
                        asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) &
                        (asset_list["Guarantee"] == guarantee)
                    ),
                    outcome,
                    asset_list["ECO Eligibility"]
                )

            asset_list = asset_list.drop(columns=["Guarantee"])

        # The CIGA merge must not have duplicated or dropped any asset list rows
        if asset_list.shape[0] != asset_list_starting_size:
            raise ValueError("The asset list has changed in size")

        # If we have eco3 surveys, we set a property to not eligible
        if not eco3_list.empty:
            eco3_list_to_merge = eco3_list[["asset_list_row_id"]].copy()
            eco3_list_to_merge["has_eco3"] = True
            asset_list = asset_list.merge(
                eco3_list_to_merge, how="left", on="asset_list_row_id"
            )

            if asset_list.shape[0] != asset_list_starting_size:
                raise ValueError("The asset list has changed in size, when merging on eco3")

            # Any rows that have an eco3 survey are set to not eligible
            asset_list["ECO Eligibility"] = np.where(
                asset_list["has_eco3"] == True,
                "not eligible",
                asset_list["ECO Eligibility"]
            )
            # The has_eco3 marker column is left on the asset list (it is not dropped here)

        # Report on sales
        sales_report = {}
        if not survey_list.empty:
            # The scheme name is taken from the first column of the survey list
            scheme_column = survey_list.columns[0]
            survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)

            if "INSTALLED OR CANCELLED" in survey_list.columns:
                # Clean the free-text status: lower case, no punctuation, single spaces, trimmed
                cleaned_status = survey_list["INSTALLED OR CANCELLED"].str.lower()
                cleaned_status = cleaned_status.str.replace(r'[^\w\s]', '', regex=True)
                cleaned_status = cleaned_status.str.replace(r'\s+', ' ', regex=True)
                survey_list["installed_or_cancelled_clean"] = cleaned_status.str.strip()

                survey_list["installation_status"] = None
                survey_list["installation_status"] = np.where(
                    survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
                    "installed",
                    survey_list["installation_status"]
                )
                survey_list["installation_status"] = np.where(
                    survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
                    "cancelled",
                    survey_list["installation_status"]
                )
                # Find partial installations
                # NOTE(review): .str.contains yields NaN for missing values; confirm how np.where
                # is expected to treat those rows (they end up "in progress" after the fillna below)
                survey_list["installation_status"] = np.where(
                    survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
                    "in progress",
                    survey_list["installation_status"]
                )
                # Find partial cancellations
                # TODO: We might have more indications of partial cancellations
                survey_list["installation_status"] = np.where(
                    survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
                    "cancelled",
                    survey_list["installation_status"]
                )
            else:
                # We have some examples, e.g. HA28, where we do not have the installed or cancelled
                # column. The date column is then used instead; its header is spelt in two ways.
                if 'INSTALL/ CANCELLATION DATE' in survey_list.columns:
                    date_column = 'INSTALL/ CANCELLATION DATE'
                else:
                    date_column = 'INSTALL / CANCELLATION DATE'
                # NOTE(review): values that are missing or not strings give NaN from the .str
                # accessor; confirm whether such rows should count as cancelled or installed
                survey_list["installation_status"] = np.where(
                    survey_list[date_column].str.lower().str.contains("cancelled"),
                    "cancelled",
                    "installed",
                )

            # Finally, for other cases, we set the status to "in progress"
            survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")

            # We concatenate the scheme name with the installation status
            survey_list["installation_status"] = (
                survey_list[scheme_column] + " - " + survey_list["installation_status"]
            )

            # We get the sales
            sales_report = {
                "ECO4 - surveys sold": survey_list.shape[0],
                **survey_list["installation_status"].value_counts().to_dict()
            }

            # We find some cases where properties have sold but are missing CIGA checks
            survey_list_to_merge = survey_list[["asset_list_row_id", "installation_status"]].copy()
            survey_list_to_merge["has_a_survey_record"] = True
            survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])]

            # NOTE(review): unlike the CIGA and ECO3 merges, the row count is not checked after
            # this merge - confirm that survey rows are unique per asset_list_row_id
            asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id")

            # Update the cases where properties have sold, but are missing a CIGA check
            # If we don't have a CIGA list, we set the value to ECO4
            set_to = "eco4 - passed ciga" if not ciga_list.empty else "eco4"
            asset_list["ECO Eligibility"] = np.where(
                (asset_list["ECO Eligibility"].str.contains("subject to ciga")) & (
                    asset_list["has_a_survey_record"] == True
                ),
                set_to,
                asset_list["ECO Eligibility"]
            )
            # Update the cases where a property has been marked as eligible for GBIS, but sold for ECO4
            asset_list["ECO Eligibility"] = np.where(
                (asset_list["ECO Eligibility"] == "gbis") & (
                    asset_list["installation_status"].isin(eco4_statuses)
                ),
                "eco4",
                asset_list["ECO Eligibility"]
            )
            # Update the cases where a property was marked as eligible for ECO4, but sold for GBIS
            asset_list["ECO Eligibility"] = np.where(
                (asset_list["ECO Eligibility"].isin(
                    [
                        "eco4",
                        "eco4 (subject to ciga)",
                        "eco4 - passed ciga",
                        "failed ciga",
                        "eco4 (subject to archetype)",
                        "eco4 (subject to ciga) (subject to archetype)"
                    ]
                )) & (
                    asset_list["installation_status"].isin(gbis_statuses)
                ),
                "gbis",
                asset_list["ECO Eligibility"]
            )
            # Update the cases where a property is marked as not eligible, but sold for GBIS
            asset_list["ECO Eligibility"] = np.where(
                (asset_list["ECO Eligibility"] == "not eligible") & (
                    asset_list["installation_status"].isin(gbis_statuses)
                ),
                "gbis",
                asset_list["ECO Eligibility"]
            )

            # Update the cases where a property is marked as not eligible, but sold for ECO4
            asset_list["ECO Eligibility"] = np.where(
                (asset_list["ECO Eligibility"] == "not eligible") & (
                    asset_list["installation_status"].isin(eco4_statuses)
                ),
                "eco4",
                asset_list["ECO Eligibility"]
            )

            asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"])

        # Update the survey list with installation status
        self.data[ha_name]["survey_list"] = survey_list

        # Insert updated asset list
        self.data[ha_name]["asset_list"] = asset_list

        ha_facts_and_figures.append(
            {
                "HA Name": ha_name,
                **asset_list["ECO Eligibility"].value_counts().to_dict(),
                **sales_report
            }
        )

    ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures)
    # The "not eligible" count is not reported. errors="ignore" because the column only exists
    # when at least one HA has a "not eligible" property.
    ha_facts_and_figures = ha_facts_and_figures.drop(
        columns=["not eligible"], errors="ignore"
    )

    # A value that never occurs for an HA is a count of zero
    ha_facts_and_figures = ha_facts_and_figures.fillna(0)
    # Make all columns apart from HA Name integers
    for col in ha_facts_and_figures.columns[1:]:
        ha_facts_and_figures[col] = ha_facts_and_figures[col].astype(int)

    ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name")
    ha_facts_and_figures = ha_facts_and_figures.fillna(0)

    self.facts_and_figures = ha_facts_and_figures
|
|
|
|
|
|
# HAs whose property type is PROPERTY_TYPE_LOOKUP[ha_name].get(<stripped column value>);
# values that are not in the lookup give None. The built form is not known for these.
_STRIPPED_LOOKUP_COLUMNS = {
    "HA2": "Dwelling Type",
    "HA5": "Asset Type",
    "HA8": "Property Type",
    "HA11": "Property Type",
    "HA12": "Asset_Type1",
    "HA13": "Type Cd",
    "HA15": "Property Type",
    "HA18": "Asset Type",
    "HA20": "Asset Type",
    "HA24": "Property Type",
    "HA32": "Dwelling type",
    "HA34": "Property Type",
    "HA35": "Property Type Grouping",
    "HA37": "PROPERTY TYPE",
    "HA41": "Archetype",
    "HA42": "Dwelling use/type",
    "HA45": "Property type",
    "HA48": "Property Type",
    "HA50": "Property Type",
    "HA51": "Asset Type",
    "HA52": "Property Type",
    "HA56": "Dwelling Type Description",
    "HA63": "PropertyType",
    "HA117": "Property Type",
    "HAXXX": "Unit Description",
}

# HAs whose property type is PROPERTY_TYPE_LOOKUP[ha_name][<column value>]; a value that is
# missing from the lookup raises KeyError.
_STRICT_LOOKUP_COLUMNS = {
    "HA25": "T1_AssetType",
    "HA28": "Property Type - Academy",
    "HA30": "A_AssetType",
}

# HAs whose column already holds the property type and is returned unchanged.
_PASSTHROUGH_COLUMNS = {
    "HA17": "property_type",
    "HA19": "Dwelling Type",
    "HA27": "Property Type",
    "HA54": "Property Type",
}

# HAs whose property type is deduced from keywords in a free-text description column.
_KEYWORD_COLUMNS = {
    "HA9": "Asset Type",
    "HA21": "Property Type",
    "HA31": "A_AssetType",
}

# Checked in this order; the first keyword found in the description wins.
_KEYWORD_PROPERTY_TYPES = (
    ("house", "House"),
    ("flat", "Flat"),
    ("bungalow", "Bungalow"),
    ("maisonette", "Maisonette"),
)


def _property_type_from_description(description):
    """Return the property type for the first known keyword in *description*, else None."""
    text = description.strip().lower()
    return next(
        (property_type for keyword, property_type in _KEYWORD_PROPERTY_TYPES if keyword in text),
        None,
    )


def get_property_type_and_built_form(property_meta, ha_name):
    """
    Work out the property type and built form of an asset list row.

    Every HA stores this information in different columns, so the extraction is
    specific to ``ha_name``.

    :param property_meta: a single asset list row, indexable by column name
    :param ha_name: the HA the row belongs to, which selects the columns and lookups used
    :return: a ``(property_type, built_form)`` tuple; either element may be None
    :raises NotImplementedError: if there is no extraction rule for ``ha_name``
    """
    # Nothing can be extracted for this HA
    if ha_name in ("HA44",):
        return None, None

    if ha_name in _KEYWORD_COLUMNS:
        return _property_type_from_description(property_meta[_KEYWORD_COLUMNS[ha_name]]), None

    if ha_name in _PASSTHROUGH_COLUMNS:
        return property_meta[_PASSTHROUGH_COLUMNS[ha_name]], None

    if ha_name in _STRICT_LOOKUP_COLUMNS:
        return PROPERTY_TYPE_LOOKUP[ha_name][property_meta[_STRICT_LOOKUP_COLUMNS[ha_name]]], None

    if ha_name in _STRIPPED_LOOKUP_COLUMNS:
        column = _STRIPPED_LOOKUP_COLUMNS[ha_name]
        # HA52 has rows without a property type
        if ha_name == "HA52" and property_meta[column] is None:
            return None, None
        return PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta[column].strip()), None

    if ha_name == "HA1":
        property_type = property_meta["Asset Type"]
        # We correct a small error
        if property_type == "a":
            property_type = "House"
        # Remap bedsits to flats
        if property_type in ["Bedsit", "Room"]:
            property_type = "Flat"
        built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"], None)
        return property_type, built_form

    if ha_name == "HA6":
        property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]]
        return property_type, property_meta["built_form"]

    if ha_name == "HA7":
        property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["Archetype"])
        built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"])
        return property_type, built_form

    if ha_name == "HA14":
        if property_meta["Asset Type Description"] != "Block - Repair":
            property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][
                property_meta["Asset Type Description"]
            ]
        elif "room" in property_meta["Address 1"].lower():
            # We try and deduce if it's a flat or house, depending on if it has "room" in the address
            property_type = "House"
        else:
            property_type = "Flat"
        return property_type, None

    if ha_name == "HA16":
        config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]]
        return config.get("property-type"), config.get("built-form")

    if ha_name == "HA39":
        property_type_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {})
        property_type = property_type_config.get("property_type", None)
        built_form = property_type_config.get("built_form", None)
        if property_type is None:
            # We check for the presence of flat in the address
            property_type = "Flat" if "flat" in property_meta["matching_address"] else "House"
        return property_type, built_form

    if ha_name == "HA49":
        return property_meta["Property Class"].strip(), None

    if ha_name == "HA107":
        return property_meta.get("property_type", None), property_meta.get("built_form", None)

    if ha_name == "HAXX":
        # The property type is the text before the first colon
        return property_meta["Property Type"].split(":")[0].strip(), None

    raise NotImplementedError("Implement me")
|
|
|
|
|
|
def _classify_post_install_sap(current_sap, post_install_sap):
    """
    Grade how comfortably the predicted post-install SAP clears the upgrade target.

    Properties with a current SAP of 38 or below are graded against 57/55/53; every
    other property is graded against 71/69/67. A score below the lowest threshold
    (or a missing score) is graded "unlikely".
    """
    thresholds = (57, 55, 53) if current_sap <= 38 else (71, 69, 67)
    labels = ("highest confidence", "high confidence", "medium confidence")
    for threshold, label in zip(thresholds, labels):
        if post_install_sap >= threshold:
            return label
    return "unlikely"


def get_epc_data(
        loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
):
    """
    Find the EPC for every asset list row, assess eligibility and score eligible properties.

    For each HA in ``loader.data`` the asset list is walked row by row: the EPC is searched
    for, GBIS and ECO4 eligibility are checked, and ECO4-eligible properties are prepared
    for the SAP change model. The model predictions are then turned into a post-install SAP
    and an eligibility classification. The results for each HA are pickled to S3.

    :param loader: object whose ``data`` maps HA name to a dict holding at least
        "asset_list" and "survey_list"
    :param cleaned: passed through to ``Eligibility``, ``calculate_cavity_age`` and
        ``prepare_model_data_row``
    :param cleaning_data: passed through to ``prepare_model_data_row``
    :param created_at: passed through to ``prepare_model_data_row`` and used as the
        ``ModelApi`` timestamp
    :param photo_supply_lookup: passed through to ``prepare_model_data_row``
    :param floor_area_decile_thresholds: passed through to ``prepare_model_data_row``
    :param pull_data: when False, nothing is recomputed and the previously stored results
        of each HA are read back from S3 instead
    :return: dict mapping HA name to a dict with "results_df", "scoring_df" and "nodata"
    :raises ValueError: if the loader holds no data, or if a merge changes the number of rows
    """
    if not loader.data:
        raise ValueError("Data not found - please run loader.load() first")

    outputs = {}
    for ha_name, data_assets in loader.data.items():

        if not pull_data:
            # Then we retrieve the data from S3
            processed_ha_results = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
            )

            outputs[ha_name] = {
                "results_df": processed_ha_results["results_df"],
                "scoring_df": processed_ha_results["scoring_df"],
                "nodata": processed_ha_results["nodata"]
            }
            continue

        # For each HA, we pull in the data required, and store in S3
        asset_list = data_assets["asset_list"].copy()

        # If the survey list is missing, it means we have not yet completed any surveys and
        # therefore should only consider the most recent EPC
        # NOTE(review): this only detects a survey list that is None; confirm whether a missing
        # survey list can also arrive as an empty DataFrame
        consider_penultimate_epc = data_assets["survey_list"] is not None

        # We iterate through the asset list and pull what we need
        results = []
        scoring_data = []
        nodata = []
        failed_model_rows = []
        for _, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):

            if property_meta["matching_postcode"] is None:
                continue

            property_type, built_form = get_property_type_and_built_form(
                property_meta=property_meta, ha_name=ha_name
            )

            searcher = SearchEpc(
                address1=str(property_meta["HouseNo"]),
                postcode=property_meta["matching_postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                full_address=property_meta["matching_address"]
            )
            searcher.ordnance_survey_client.property_type = property_type
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                nodata.append(property_meta)
                continue

            if searcher.newest_epc.get("estimated"):
                # We insert the row ID as our proxy for UPRN
                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])

            newest_epc = searcher.newest_epc
            older_epcs = searcher.older_epcs
            full_sap_epc = searcher.full_sap_epc

            # If we have a survey list, we check the penultimate, because the property might have
            # been installed
            penultimate_epc = newest_epc
            if consider_penultimate_epc:
                # We also want to get the penultimate epc
                penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
                if not penultimate_epc:
                    penultimate_epc = newest_epc

            eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()

            identified_for_gbis = property_meta["ECO Eligibility"] in ["gbis"]
            identified_for_eco4 = property_meta["ECO Eligibility"] in ["eco4"]

            # The GBIS result is read through its "eligible" entry, like the ECO4 result. Testing
            # the truthiness of the result object itself cannot tell eligible from not eligible.
            eco4_eligible = eligibility.eco4_warmfront["eligible"]
            gbis_eligible = eligibility.gbis_warmfront["eligible"]

            # The penultimate EPC is only consulted when the property was identified for a scheme
            # but its newest EPC does not support that:
            #   - identified for gbis, and eligible for neither gbis nor eco4
            #   - identified for eco4, and not eligible for eco4
            # Every other combination (correctly identified, nothing identified, not identified
            # but seemingly eligible, or subject to CIGA) keeps the newest EPC assessment.
            recheck_gbis = identified_for_gbis and not gbis_eligible and not eco4_eligible
            recheck_eco4 = identified_for_eco4 and not eco4_eligible

            if consider_penultimate_epc and (recheck_gbis or recheck_eco4):
                # We check the penultimate epc
                eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
                eligibility.check_gbis_warmfront()
                eligibility.check_eco4_warmfront()
                # The assessed EPC must no longer be part of the older EPCs. Estimated EPCs are
                # left alone, just to make data cleaning easier
                if penultimate_epc.get("estimated") is None:
                    older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]

            # If the property was identified for ECO4 but still does not look eligible, we produce
            # an estimate for the age of the cavity so that particularly old ones can be flagged
            cavity_age = None
            if (
                identified_for_eco4 and not eligibility.eco4_warmfront["eligible"]
            ):
                cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)

            if eligibility.eco4_warmfront["eligible"]:
                if eligibility.epc["uprn"] == "":
                    # No UPRN on the EPC, so the row ID is used as a proxy
                    eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
                try:
                    scoring_dictionary = prepare_model_data_row(
                        property_id=property_meta["asset_list_row_id"],
                        modelling_epc=eligibility.epc,
                        cleaned=cleaned,
                        cleaning_data=cleaning_data,
                        created_at=created_at,
                        old_data=older_epcs,
                        full_sap_epc=full_sap_epc,
                        photo_supply_lookup=photo_supply_lookup,
                        floor_area_decile_thresholds=floor_area_decile_thresholds
                    )
                    scoring_data.extend(scoring_dictionary)
                except Exception as e:
                    # Best effort: a row that cannot be prepared is recorded and reported, and the
                    # property simply ends up without a prediction
                    failed_model_rows.append(
                        property_meta["asset_list_row_id"]
                    )
                    logger.warning(
                        f"Could not prepare model data for {property_meta['asset_list_row_id']}: {e!r}"
                    )

            results.append(
                {
                    "row_id": property_meta["asset_list_row_id"],
                    "uprn": eligibility.epc["uprn"],
                    "is_estimated": searcher.newest_epc.get("estimated") is not None,
                    "property_type": eligibility.epc["property-type"],
                    "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                    "eco4_message": eligibility.eco4_warmfront["message"],
                    "eco4_strict": eligibility.eco4_warmfront["strict"],
                    "gbis_eligible": eligibility.gbis_warmfront["eligible"],
                    "gbis_message": eligibility.gbis_warmfront["message"],
                    "gbis_strict": eligibility.gbis_warmfront["strict"],
                    "sap": float(eligibility.epc["current-energy-efficiency"]),
                    # Property components
                    "roof": eligibility.roof["clean_description"],
                    "walls": eligibility.walls["clean_description"],
                    "cavity_type": eligibility.cavity["type"],
                    "heating": eligibility.epc["mainheat-description"],
                    "tenure": eligibility.tenure,
                    "date_epc": eligibility.epc["lodgement-date"],
                    "loft_thickness": eligibility.roof["insulation_thickness"],
                    "cavity_age": cavity_age,
                    "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"],
                    "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"]
                }
            )

        if failed_model_rows:
            logger.warning(
                f"{ha_name}: model data could not be prepared for {len(failed_model_rows)} properties"
            )

        results_df = pd.DataFrame(results)
        scoring_df = pd.DataFrame(scoring_data)
        # Defaults for HAs where nothing could be scored
        results_df["post_install_sap"] = None
        results_df["eligibility_classification"] = None

        if not scoring_df.empty:
            scoring_df = scoring_df.drop(
                columns=[
                    "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
                    "carbon_ending"
                ]
            )

            model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
            # Only the SAP change model is needed here
            model_api.MODEL_PREFIXES = ["sap_change_predictions"]

            scoring_df["id"] = scoring_df["id"] + "phase=0"
            # We split up the scoring_df and score it in chunks of 400 rows
            predictions = []
            to_loop_over = range(0, scoring_df.shape[0], 400)
            for chunk in tqdm(to_loop_over, total=len(to_loop_over)):
                predictions_dict = model_api.predict_all(
                    df=scoring_df.iloc[chunk:chunk + 400],
                    bucket="retrofit-data-dev",
                    prediction_buckets={
                        "sap_change_predictions": "retrofit-sap-predictions-dev",
                    }
                )

                predictions.append(predictions_dict["sap_change_predictions"])

            predictions = pd.concat(predictions)
            predictions_size = predictions.shape[0]

            # Bring the current SAP alongside each prediction to work out the uplift
            predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
                results_df[["row_id", "sap"]], how="left", on="row_id"
            )
            if predictions.shape[0] != predictions_size:
                raise ValueError("Predictions size has changed")
            predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]

            results_df = results_df.merge(
                predictions[["sap_uplift", "row_id"]],
                how="left",
                on="row_id"
            )
            results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]

            # The upgrade requirements are dependent on the current SAP
            eligibility_assessment = pd.DataFrame(
                [
                    {
                        "row_id": row["row_id"],
                        "eligibility_classification": _classify_post_install_sap(
                            row["sap"], row["post_install_sap"]
                        )
                    }
                    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows()
                ]
            )

            # NOTE(review): results_df already has an "eligibility_classification" column at this
            # point, so pandas suffixes the overlapping columns (_x/_y) in the merged frame -
            # confirm that is what downstream consumers expect before changing it
            results_df = results_df.merge(
                eligibility_assessment, how="left", on="row_id"
            )
            # Make sure the results haven't changed in size
            if results_df.shape[0] != len(results):
                raise ValueError("results has changed size")

        ha_output = {
            "results_df": results_df,
            "scoring_df": scoring_df,
            "nodata": nodata
        }

        # We store the results in S3 as a pickle
        save_pickle_to_s3(
            data=dict(ha_output),
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
        )

        outputs[ha_name] = ha_output

    return outputs
|
|
|
|
|
|
def get_col_widths(dataframe, max_allowed_width=25):
    """
    Compute a display width for every column of a dataframe (e.g. to size spreadsheet columns).

    Each width is the larger of the longest stringified cell in the column and the column header plus two
    characters of padding, capped at ``max_allowed_width``. For MultiIndex columns the header width is taken
    from the longest label across the levels of that column.

    :param dataframe: The dataframe whose columns are measured
    :param max_allowed_width: Upper bound applied to every width, preventing excessively wide columns
    :return: A list of widths, one per column, in column order
    """
    header_padding = 2
    has_multiindex_header = isinstance(dataframe.columns, pd.MultiIndex)
    has_rows = len(dataframe.index) > 0

    widths = []
    for position, label in enumerate(dataframe.columns):
        if has_multiindex_header:
            # A MultiIndex label is a tuple with one entry per header level
            header_width = max(len(str(level)) for level in label) + header_padding
        else:
            header_width = len(str(label)) + header_padding

        if has_rows:
            # Select by position: selecting by label returns a DataFrame when column names are duplicated,
            # and the lengths measured would then be row counts rather than cell text lengths
            data_width = int(dataframe.iloc[:, position].astype(str).apply(len).max())
        else:
            # The max of an empty column is NaN, which would leak into the result; only the header counts
            data_width = 0

        widths.append(min(max(data_width, header_width), max_allowed_width))

    return widths
|
|
|
|
|
|
# def analyse_ha_data(outputs, loader):
|
|
# """
|
|
# The approach we take within this function is the following:
|
|
# For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
|
|
# characterisation can be broken down as the following:
|
|
# 1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria
|
|
# 2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to
|
|
# a CIGA check
|
|
# 3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft
|
|
# insulation
|
|
# 4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under
|
|
# any circumstances, given the available data
|
|
#
|
|
# Then, for any property that has NOT been identified by Warmfront, we identify properties that look like they would
|
|
# qualify under the strictest criteria, and mark these as potential additional opportunities.
|
|
#
|
|
# :return:
|
|
# """
|
|
#
|
|
# eco4_rate = 1710
|
|
# gbis_rate = 600
|
|
# # old_eco4_rate = 1456
|
|
# old_gbis_rate = 432
|
|
#
|
|
# epc_c_threshold = 80
|
|
# scheme_map = {
|
|
# "ECO4": "ECO4",
|
|
# "AFFORDABLE WARMTH": "ECO4",
|
|
# "ECO4 A/W": "ECO4",
|
|
# "ECO4 GBIS (ECO+)": "GBIS"
|
|
# }
|
|
#
|
|
# ha_analysis_results = []
|
|
# total_revenue_results = []
|
|
# for ha_name, datasets in outputs.items():
|
|
# inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
|
#
|
|
# results_df = datasets["results_df"].copy()
|
|
#
|
|
# analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename(
|
|
# columns={"row_meaning": "asset_identification_status"}
|
|
# ).merge(
|
|
# results_df,
|
|
# how="left",
|
|
# right_on="row_id",
|
|
# left_on="asset_list_row_id"
|
|
# )
|
|
#
|
|
# analysis_data["is_remaining"] = True
|
|
#
|
|
# n_sold_eco4 = 0
|
|
# n_sold_gbis = 0
|
|
# if not inputs["survey_list"].empty:
|
|
# # Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had
|
|
# # a survey)
|
|
# survey_list = inputs["survey_list"].copy()
|
|
#
|
|
# # TODO: TEMP
|
|
# scheme_column = survey_list.columns[0]
|
|
# # We clean up the survey list installation or cancelled
|
|
# survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
|
|
# # Remove all punctuation
|
|
# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
|
|
# r'[^\w\s]', '', regex=True
|
|
# )
|
|
# # Remove double spaces
|
|
# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
|
|
# r'\s+', ' ', regex=True
|
|
# )
|
|
# # Remove trailing spaces
|
|
# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip()
|
|
#
|
|
# # Remap the values in the scheme column
|
|
# survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)
|
|
#
|
|
# survey_list["installation_status"] = None
|
|
# survey_list["installation_status"] = np.where(
|
|
# survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
|
|
# "installed",
|
|
# survey_list["installation_status"]
|
|
# )
|
|
# survey_list["installation_status"] = np.where(
|
|
# survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
|
|
# "cancelled",
|
|
# survey_list["installation_status"]
|
|
# )
|
|
# # Find partial installations
|
|
# survey_list["installation_status"] = np.where(
|
|
# survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
|
|
# "partially installed",
|
|
# survey_list["installation_status"]
|
|
# )
|
|
# # Find partial cancellations
|
|
# # TODO: We might have more indications of partial cancellations
|
|
# survey_list["installation_status"] = np.where(
|
|
# survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
|
|
# "partially cancelled",
|
|
# survey_list["installation_status"]
|
|
# )
|
|
#
|
|
# # Finally, for other cases, we set the status to "in progress"
|
|
# survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")
|
|
#
|
|
# # We concatenate the scheme name with the installation status
|
|
# survey_list["installation_status"] = (
|
|
# survey_list[scheme_column] + " - " + survey_list["installation_status"]
|
|
# )
|
|
#
|
|
# # TODO: END TEMP
|
|
#
|
|
# survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy()
|
|
# survey_list_to_merge["is_remaining"] = False
|
|
# analysis_data = analysis_data.drop(columns="is_remaining").merge(
|
|
# survey_list_to_merge,
|
|
# how="left", on="asset_list_row_id"
|
|
# )
|
|
# analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True)
|
|
#
|
|
# n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0]
|
|
# n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0]
|
|
#
|
|
# # Take just remaining
|
|
# analysis_data = analysis_data[analysis_data["is_remaining"]]
|
|
#
|
|
# # Also, if the HA has started selling, we remove any that are still subject to ciga
|
|
# n_eco4_missed_subject_to_ciga = 0
|
|
# if not inputs["survey_list"].empty:
|
|
# n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum()
|
|
# analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"]
|
|
#
|
|
# ################################################################################################
|
|
# # We take the properties that strictly qualified under eco
|
|
# ################################################################################################
|
|
#
|
|
# eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy()
|
|
# eco4_identified["identification_type"] = None
|
|
# eco4_identified["identification_type"] = np.where(
|
|
# (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True),
|
|
# "strict",
|
|
# eco4_identified["identification_type"]
|
|
# )
|
|
#
|
|
# # For expansive, the property can be no higher than an EPC C
|
|
# eco4_identified["identification_type"] = np.where(
|
|
# (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & (
|
|
# eco4_identified["sap"] <= epc_c_threshold
|
|
# ),
|
|
# "expansive",
|
|
# eco4_identified["identification_type"]
|
|
# )
|
|
# ################################################################################################
|
|
# # We take the properties dependent on CIGA
|
|
# ################################################################################################
|
|
#
|
|
# ciga_dependent_identified = analysis_data[
|
|
# analysis_data["ECO Eligibility"].isin(
|
|
# [
|
|
# "eco4 (subject to ciga)",
|
|
# "eco4 - passed ciga"
|
|
# ]
|
|
# )
|
|
# ].copy()
|
|
#
|
|
# # These are properties that show filled cavity
|
|
# ciga_dependent_identified["identification_type"] = None
|
|
# ciga_dependent_identified["identification_type"] = np.where(
|
|
# ciga_dependent_identified["eco4_message"].isin(
|
|
# [
|
|
# "Perfect suitability",
|
|
# "Meets cavity and sap",
|
|
# "Fails cavity, meets loft, fails SAP",
|
|
# "Meets fabric, fails SAP check",
|
|
# "Meets cavity, loft borderline, meets sap",
|
|
# ]
|
|
# ) & (ciga_dependent_identified["sap"] <= epc_c_threshold),
|
|
# "strict",
|
|
# ciga_dependent_identified["identification_type"]
|
|
# )
|
|
#
|
|
# ciga_dependent_identified["identification_type"] = np.where(
|
|
# ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | (
|
|
# ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])
|
|
# )) & (
|
|
# (ciga_dependent_identified["sap"] <= epc_c_threshold) &
|
|
# pd.isnull(ciga_dependent_identified["identification_type"])
|
|
# ),
|
|
# "expansive",
|
|
# ciga_dependent_identified["identification_type"]
|
|
# )
|
|
#
|
|
# ################################################################################################
|
|
# # We take the properties that qualified for gbis
|
|
# ################################################################################################
|
|
# gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy()
|
|
# gbis_identified["identification_type"] = None
|
|
# gbis_identified["identification_type"] = np.where(
|
|
# (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69),
|
|
# "strict",
|
|
# gbis_identified["identification_type"]
|
|
# )
|
|
#
|
|
# gbis_identified["identification_type"] = np.where(
|
|
# (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & (
|
|
# pd.isnull(gbis_identified["identification_type"])
|
|
# ),
|
|
# "expansive",
|
|
# gbis_identified["identification_type"]
|
|
# )
|
|
#
|
|
# # Finally, we look at the properties that have not been identified by Warmfront
|
|
# not_identified = analysis_data[
|
|
# analysis_data["ECO Eligibility"].isin(
|
|
# [
|
|
# "not eligible"
|
|
# ]
|
|
# )
|
|
# ].copy()
|
|
#
|
|
# surplus_eco4 = not_identified[
|
|
# (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin(
|
|
# ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"]
|
|
# ))
|
|
# ]
|
|
#
|
|
# surplus_gbis = not_identified[
|
|
# (not_identified["gbis_eligible"] == True) & (
|
|
# ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values)
|
|
# ) & (not_identified["sap"] < 69) & (
|
|
# (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | (
|
|
# not_identified["walls"].str.contains("partial", case=False, na=False)
|
|
# )
|
|
# )
|
|
# ]
|
|
# surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False]
|
|
#
|
|
# # Output variables - the data was sent to us in December, but the remaining figures are
|
|
# # what was in November
|
|
# november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name]
|
|
#
|
|
# # ECO4
|
|
# n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0]
|
|
# november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0)
|
|
# november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0]
|
|
# eco4_sales_since_november = n_sold_eco4 - november_eco4_sold
|
|
#
|
|
# n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0]
|
|
# eco4_of_which_identified_strict = (
|
|
# eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] +
|
|
# ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0]
|
|
# )
|
|
# eco4_of_which_identified_expansive = (
|
|
# eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] +
|
|
# ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0]
|
|
# )
|
|
# # GBIS
|
|
# n_warmfront_identified_gbis = gbis_identified.shape[0]
|
|
# november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0)
|
|
# november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0]
|
|
# gbis_sales_since_november = n_sold_gbis - november_gbis_sold
|
|
# gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0]
|
|
# gbis_of_which_identified_expansive = \
|
|
# gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0]
|
|
#
|
|
# to_append = {
|
|
# ("", "HA Name"): ha_name,
|
|
# ("", "# properties in asset list"): n_properties_remaining_in_asset_list,
|
|
# ############
|
|
# # ECO4
|
|
# ############
|
|
# ("ECO4", "# remaining November file"): november_eco4_remaining,
|
|
# ("ECO4", "# sold in November file"): november_eco4_sold,
|
|
# ("ECO4", "# sold (survey list)"): n_sold_eco4,
|
|
# ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga,
|
|
# ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4,
|
|
# ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict,
|
|
# ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive,
|
|
# ("ECO4", "Of which identified by model - total"): (
|
|
# eco4_of_which_identified_strict + eco4_of_which_identified_expansive
|
|
# ),
|
|
# ("ECO4", "Additional properties"): surplus_eco4.shape[0],
|
|
# ############
|
|
# # GBIS
|
|
# ############
|
|
# ("GBIS", "# remaining November file"): november_gbis_remaining,
|
|
# ("GBIS", "# sold in November file"): november_gbis_sold,
|
|
# ("GBIS", "# sold (survey list)"): n_sold_gbis,
|
|
# ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis,
|
|
# ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict,
|
|
# ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive,
|
|
# ("GBIS", "Of which identified by model - total"): (
|
|
# gbis_of_which_identified_strict + gbis_of_which_identified_expansive
|
|
# ),
|
|
# ("GBIS", "Additional properties"): surplus_gbis.shape[0]
|
|
# }
|
|
#
|
|
# ha_analysis_results.append(to_append)
|
|
#
|
|
# # Calculate the revenue results
|
|
# to_append_revenue = {
|
|
# ("", "HA Name"): ha_name,
|
|
# # Eco4 revenue
|
|
# ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate,
|
|
# ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate,
|
|
# ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate,
|
|
# ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate,
|
|
# ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate,
|
|
# ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate,
|
|
# ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate,
|
|
# ("ECO4", "Of which identified by model - total"): eco4_rate * (
|
|
# eco4_of_which_identified_strict + eco4_of_which_identified_expansive
|
|
# ),
|
|
# ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0],
|
|
# }
|
|
# total_revenue_results.append(to_append_revenue)
|
|
#
|
|
# ha_analysis_results = pd.DataFrame(ha_analysis_results)
|
|
# ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns)
|
|
#
|
|
# facts_and_figures = loader.facts_and_figures.copy()
|
|
# facts_and_figures["ha_number"] = facts_and_figures["HA Name"].str.extract(r'(\d+)').astype(int)
|
|
# facts_and_figures = facts_and_figures.sort_values("ha_number")
|
|
# facts_and_figures = facts_and_figures.drop(columns=["ha_number"])
|
|
#
|
|
# # Rename some of the cols
|
|
# facts_and_figures = facts_and_figures.rename(
|
|
# columns={
|
|
# # ECO4 cols
|
|
# "ECO4": "ECO4 - November",
|
|
# "GBIS": "GBIS - November",
|
|
# "eco4 (subject to ciga)": "ECO4 - subject to ciga",
|
|
# "eco4": "ECO4 - doesn't need CIGA",
|
|
# "eco4 - passed ciga": "ECO4 - passed CIGA",
|
|
# "failed ciga": "ECO4 - failed CIGA",
|
|
# "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS",
|
|
# "ECO4 - in progress": "ECO4 - Install in progress",
|
|
# "ECO4 - cancelled": "ECO4 - Install cancelled",
|
|
# # GBIS cols
|
|
# "gbis": "GBIS total (asset list)"
|
|
# }
|
|
# )
|
|
# # We calculate the eco4 total from the asset list
|
|
# # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is
|
|
# # ECO4 - doesn't need CIGA + ECO4 - passed CIGA
|
|
# # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is
|
|
# # ECO4 - doesn't need CIGA + ECO4 - subject to ciga
|
|
# facts_and_figures["ECO4 total (asset list - pre ciga)"] = (
|
|
# facts_and_figures["ECO4 - doesn't need CIGA"] +
|
|
# facts_and_figures["ECO4 - subject to ciga"] +
|
|
# facts_and_figures["ECO4 - passed CIGA"]
|
|
# )
|
|
#
|
|
# facts_and_figures["ECO4 total (asset list - post ciga)"] = None
|
|
# facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where(
|
|
# facts_and_figures["ECO4 - passed CIGA"] > 0,
|
|
# facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"],
|
|
# facts_and_figures["ECO4 total (asset list - post ciga)"]
|
|
# )
|
|
#
|
|
# # Re-arrange the columns
|
|
# facts_and_figures = facts_and_figures[
|
|
# [
|
|
# 'HA Name',
|
|
# 'ECO4 - November',
|
|
# 'GBIS - November',
|
|
# 'ECO4 total (asset list - pre ciga)',
|
|
# 'ECO4 total (asset list - post ciga)',
|
|
# 'GBIS total (asset list)',
|
|
# 'ECO4 - subject to ciga',
|
|
# "ECO4 - doesn't need CIGA",
|
|
# 'ECO4 - passed CIGA',
|
|
# 'ECO4 - failed CIGA',
|
|
# 'ECO4 - installed',
|
|
# 'ECO4 - Install in progress',
|
|
# 'ECO4 - Install cancelled',
|
|
# 'ECO4 - partially installed',
|
|
# 'ECO4 - Install downgrade to GBIS',
|
|
# ]
|
|
# ]
|
|
# # Add a note to flag any rows where ECO4 (
|
|
# # subject to ciga is greater than 0) and (ECO4 - passed ciga is greater than 0
|
|
# # )
|
|
# facts_and_figures["Missed CIGA checks opportunity"] = None
|
|
# facts_and_figures["Missed CIGA checks opportunity"] = np.where(
|
|
# (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0),
|
|
# "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype(
|
|
# str) + " ECO4 properties needing a CIGA check",
|
|
# facts_and_figures["Missed CIGA checks opportunity"]
|
|
# )
|
|
#
|
|
# facts_and_figures.to_csv("Facts and figures sample.csv")
|
|
#
|
|
# # Re-arrange the columns
|
|
#
|
|
# # Also sort ha_analysis_results by ha number
|
|
# ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int)
|
|
# ha_analysis_results = ha_analysis_results.sort_values("ha_number")
|
|
# ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"])
|
|
#
|
|
# # We save 2 sheets
|
|
# # Automate creation of the excel
|
|
# # Create a Pandas Excel writer using XlsxWriter as the engine
|
|
# with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer:
|
|
# # Write each dataframe to a different worksheet without the index
|
|
# for df, sheet in [(facts_and_figures, 'HA Facts and Figures'),
|
|
# (ha_analysis_results, 'Asset Identification')]:
|
|
#
|
|
# df.to_excel(writer, sheet_name=sheet)
|
|
#
|
|
# # Auto-adjust columns' width
|
|
# for i, width in enumerate(get_col_widths(df)):
|
|
# writer.sheets[sheet].set_column(i, i, width)
|
|
#
|
|
# # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their
|
|
# # description, and what proportion of time they get identified via non-invasive surveys
|
|
#
|
|
# # true_eco4_assets = []
|
|
# # ciga_dependent_assets = []
|
|
# # not_eligible = []
|
|
# # as_built_insulated = []
|
|
# # date_cols = {
|
|
# # "HA39": "date_built",
|
|
# # "HA14": "Built In Year",
|
|
# # "HA6": "Construction Year",
|
|
# # "HA1": "Build Date",
|
|
# # "HA107": "YEAR BUILT"
|
|
# # }
|
|
# # for ha_name, data_objects in outputs.items():
|
|
# # inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
|
# #
|
|
# # date_col = date_cols[ha_name]
|
|
# # results_df = data_objects["results_df"].copy()
|
|
# # df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename(
|
|
# # columns={"row_meaning": "asset_identification_status", date_col: "date_built"}
|
|
# # ).merge(
|
|
# # results_df,
|
|
# # how="left",
|
|
# # right_on="row_id",
|
|
# # left_on="asset_list_row_id"
|
|
# # )
|
|
# #
|
|
# # # take the true ECO4
|
|
# # true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy()
|
|
# # ciga_dependent = df[
|
|
# # df["ECO Eligibility"].isin(
|
|
# # [
|
|
# # "eco4 (subject to ciga)",
|
|
# # "failed ciga",
|
|
# # "eco4 - passed ciga"
|
|
# # ]
|
|
# # )
|
|
# # ]
|
|
# # insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy()
|
|
# # # We convert date built to datetime
|
|
# # try:
|
|
# # insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])]
|
|
# # insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year
|
|
# # as_built_insulated.append(insulated_assumed)
|
|
# # except Exception as e:
|
|
# # print("oh well")
|
|
# #
|
|
# # true_eco4_assets.append(true_eco4)
|
|
# # ciga_dependent_assets.append(ciga_dependent)
|
|
# #
|
|
# # true_eco4_assets = pd.concat(true_eco4_assets)
|
|
# # ciga_dependent_assets = pd.concat(ciga_dependent_assets)
|
|
# # as_built_insulated = pd.concat(as_built_insulated)
|
|
# #
|
|
# # true_eco4_assets["walls"].value_counts(normalize=True)
|
|
# # ciga_dependent_assets["walls"].value_counts(normalize=True)
|
|
# #
|
|
# # from recommendations.recommendation_utils import extract_insulation_thickness
|
|
# #
|
|
# # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply(
|
|
# # lambda x: extract_insulation_thickness(x)
|
|
# # )
|
|
# #
|
|
# # true_eco4_assets["e"] = true_eco4_assets.merge(
|
|
# # pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]],
|
|
# # how="left",
|
|
# # left_on="roof",
|
|
# # right_on="original_description"
|
|
# # )
|
|
# #
|
|
# # true_eco4_assets["sap"].mean()
|
|
# #
|
|
# # true_eco4_assets["insulation_thickness"].isin(
|
|
# # ["250", "150", "200", "100", "75", "50"]
|
|
# # ).sum() / true_eco4_assets.shape[0]
|
|
# #
|
|
# # true_eco4_assets["insulation_thickness"].isin(
|
|
# # ["100"]
|
|
# # ).sum() / true_eco4_assets.shape[0]
|
|
# #
|
|
# # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True)
|
|
|
|
|
|
def get_propensity_model_data(
        loader, cleaned, cleaning_data, created_at, photo_supply_lookup,
        floor_area_decile_thresholds, pull_data=True, random_state=None
):
    """
    Build and store the raw training data for the propensity model, one dataframe per HA.

    For every HA in ``loader.data`` with a non-empty survey list, the surveyed ("sold") properties are combined
    with a random draw from the asset list. The draw is sized so that the share of sold properties in the sample
    mirrors the share of eligible properties in the asset list. The draw is taken from the whole asset list, so it
    may overlap with the sold properties. Each sampled property is looked up with SearchEpc and only properties
    with a single EPC (no older EPCs and no full SAP EPC) are kept.

    The prepared records are pickled to S3 under ``propensity_model_data/{ha_name}/train.pickle``; per-property
    failures are pickled under ``propensity_model_data/{ha_name}/errors.pickle``.

    :param loader: Object whose ``data`` attribute maps an HA name to a dict holding the "survey_list" and
        "asset_list" dataframes
    :param cleaned: Not used by this function
    :param cleaning_data: Passed through to EPCRecord
    :param created_at: Not used by this function
    :param photo_supply_lookup: Not used by this function
    :param floor_area_decile_thresholds: Not used by this function
    :param pull_data: Not used by this function
    :param random_state: Optional seed forwarded to the random draw from the asset list, making the sample
        reproducible. The default of None keeps the draw unseeded
    :return: A list with one dataframe of prepared EPC data per processed HA
    """
    model_data = []
    for ha_name, data_assets in loader.data.items():

        logger.info("Processing HA: %s", ha_name)
        survey_list = data_assets["survey_list"]
        if survey_list.empty:
            continue

        number_sold = survey_list.shape[0]

        # For each HA, we pull in the data required, and store it in S3
        asset_list = data_assets["asset_list"].copy()
        # We determine the number of properties that we should select that are eligible
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
        if n_eligible == 0:
            # Without any eligible property the success rate is zero (or undefined for an empty asset list)
            # and the sample size below cannot be computed
            logger.error("No eligible properties in the asset list for %s, skipping", ha_name)
            continue

        success_rate = n_eligible / asset_list_size
        needed_sample_size = np.ceil(number_sold / success_rate)
        # We cannot draw more rows than the asset list holds (sampling is without replacement)
        number_negative_samples = min(int(needed_sample_size - number_sold), asset_list_size)

        sold_asset_list_ids = survey_list["asset_list_row_id"].tolist()
        negative_sample_asset_list_ids = asset_list["asset_list_row_id"].sample(
            number_negative_samples, random_state=random_state
        ).tolist()
        sample_ids = sold_asset_list_ids + negative_sample_asset_list_ids

        sample_asset_list = asset_list[asset_list["asset_list_row_id"].isin(sample_ids)]

        # In order to have the most confidence, we should take just properties that have 1 EPC. We might need to
        # cut down the number of properties that we include because of this
        # Note: This is an imbalanced problem so we will need to build a model accommodating that

        data = []
        errors = []
        for _, property_meta in tqdm(sample_asset_list.iterrows(), total=len(sample_asset_list)):

            # A missing postcode can be stored as None or as NaN; neither can be searched for
            if pd.isnull(property_meta["matching_postcode"]):
                continue

            property_type, built_form = get_property_type_and_built_form(
                property_meta=property_meta, ha_name=ha_name
            )

            searcher = SearchEpc(
                address1=str(property_meta["HouseNo"]),
                postcode=property_meta["matching_postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                full_address=property_meta["matching_address"]
            )
            searcher.ordnance_survey_client.property_type = property_type
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                continue

            if searcher.newest_epc.get("estimated"):
                # We insert the row ID as our proxy for UPRN
                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])

            newest_epc = searcher.newest_epc
            older_epcs = searcher.older_epcs
            full_sap_epc = searcher.full_sap_epc

            # If we have more than 1 EPC, for the moment we just continue
            if older_epcs or full_sap_epc:
                continue

            try:
                # We clean up the data
                epc_records = {
                    'original_epc': newest_epc.copy(),
                    'full_sap_epc': full_sap_epc.copy(),
                    'old_data': older_epcs.copy(),
                }

                epc_record = EPCRecord(
                    epc_records=epc_records,
                    run_mode="newdata",
                    cleaning_data=cleaning_data
                )

                data.append(
                    {
                        "ECO Eligibility": property_meta["ECO Eligibility"],
                        "asset_list_row_id": property_meta["asset_list_row_id"],
                        **epc_record.get("prepared_epc")
                    }
                )
            except Exception as e:
                # Best effort: one bad property should not stop the pull, so we record the failure and move on
                errors.append(
                    {
                        "error": str(e),
                        "asset_list_row_id": property_meta["asset_list_row_id"],
                        "matching_postcode": property_meta["matching_postcode"],
                        "matching_address": property_meta["matching_address"]
                    }
                )

        data = pd.DataFrame(data)
        # We store the results in S3 as a pickle
        save_pickle_to_s3(
            data=data,
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
        )

        # Store the errors
        if errors:
            save_pickle_to_s3(
                data=errors,
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/errors.pickle"
            )

        model_data.append(data)

    return model_data
|
|
|
|
|
|
def conversion_model(loader):
    """
    Fit a proof-of-concept XGBoost classifier on the stored propensity model data and print the feature
    importances, most important first.

    The training data for each HA is read from ``propensity_model_data/{ha_name}/train.pickle`` in S3 and joined to
    that HA's survey list to obtain the installation status. A property counts as a positive response when its
    installation status is "ECO4 - in progress" or "ECO4 - installed". Per HA, the negatives are then down-sampled
    so that the share of positives mirrors the share of eligible properties in the HA's asset list.

    :param loader: Object whose ``data`` attribute maps an HA name to a dict holding the "survey_list" and
        "asset_list" dataframes
    :return: None, the sorted feature importances are printed
    :raises ValueError: If no training data could be read for any HA
    """
    # Read in the model data
    model_data = []
    for ha_name in loader.data.keys():
        try:
            picked = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
            )
            data = pd.DataFrame(picked)

            # We merge on the sales data
            sales_data = loader.data[ha_name]["survey_list"].copy()
            data = data.merge(
                sales_data[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            data["ha_name"] = ha_name

        except Exception as e:
            # Best effort: an HA without usable data is skipped, but we keep the reason in the log
            logger.error("Error reading in the data for %s: %s", ha_name, e)
            continue

        model_data.append(data)

    if not model_data:
        raise ValueError("No propensity model data could be read for any HA")

    model_data = pd.concat(model_data)

    model_data["response"] = model_data["installation_status"].isin(
        [
            "ECO4 - in progress",
            "ECO4 - installed"
        ]
    ).astype(int)

    # Because of how we pulled the data, we need to re-balance the sample
    ha_names = model_data["ha_name"].unique()

    balanced_sample = []
    for ha_name in ha_names:
        df = model_data[model_data["ha_name"] == ha_name]
        positive_samples = df[df["response"] == 1]
        negative_samples = df[df["response"] != 1]

        asset_list = loader.data[ha_name]["asset_list"]
        asset_list_size = asset_list.shape[0]
        n_eligible = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
        if n_eligible == 0:
            # Without any eligible property the success rate is zero (or undefined for an empty asset list)
            # and the number of negatives to keep cannot be computed
            logger.error("No eligible properties in the asset list for %s, skipping", ha_name)
            continue

        success_rate = n_eligible / asset_list_size
        needed_sample_size = np.ceil(positive_samples.shape[0] / success_rate)
        # We cannot keep more negatives than we have (sampling is without replacement)
        number_negative_samples = min(
            int(needed_sample_size - positive_samples.shape[0]), negative_samples.shape[0]
        )
        negative_samples_subset = negative_samples.sample(number_negative_samples)

        balanced_sample.append(pd.concat([positive_samples, negative_samples_subset]))

    if not balanced_sample:
        raise ValueError("No propensity model data could be read for any HA")

    balanced_sample = pd.concat(balanced_sample)

    # We work with a small sample
    # Drop the identifiers, address fields and dates, as well as the ECO Eligibility and installation_status
    # columns, so that only candidate features and the response remain
    balanced_sample = balanced_sample.drop(
        columns=['ECO Eligibility', 'asset_list_row_id', 'address', 'uprn_source', 'address3', 'local_authority_label',
                 'county', 'postcode', 'constituency', 'local_authority', 'inspection_date', 'address1',
                 'constituency_label', 'building_reference_number', 'address2', 'posttown', 'lodgement_datetime',
                 'uprn', 'lodgement_date', 'lmk_key', 'installation_status', 'ha_name']
    )

    # POC model
    df = balanced_sample.copy()
    # Fill missings with means, if they exist
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    df[categorical_cols] = df[categorical_cols].fillna("other")

    # Reduce the number of categories to a specific number and the rest to other
    max_n_categories = 10
    for col in categorical_cols:
        top_categories = df[col].value_counts().nlargest(max_n_categories).index
        df[col] = df[col].where(df[col].isin(top_categories), other="other")

    # Use a model based approach to feature selection
    import xgboost as xgb
    from sklearn.model_selection import train_test_split

    X = df.drop(columns=['response'])
    y = df['response']

    # Encoding categorical variables if not already done
    X = pd.get_dummies(X, drop_first=True)

    # Splitting the data into train and test sets; only the training part is used to rank the features
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and fit an XGBoost classifier
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Map feature importances to their corresponding column names
    feature_importance_dict = dict(zip(X.columns, model.feature_importances_))

    # Sort features by importance
    sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

    # Display sorted features
    for feature, importance in sorted_features:
        print(f"{feature}: {importance}")
|
|
|
|
|
|
def patch_cleaned(cleaned):
    """
    Patch the cleaned description lookups with records that are missing from them.

    Hand-written records are appended to the "floor-description", "roof-description" and
    "mainheatcont-description" lists, an "Average thermal transmittance" roof record is added for every value
    produced by ``np.arange(0, 2, 0.01)`` that does not have one yet, and the floor record for
    '(Same dwelling below) insulated (assumed)' is corrected.

    :param cleaned: dict of lists of description records (dicts), keyed by description type. Modified in place.
    :return: the same ``cleaned`` object, after patching
    """
    roof_records = cleaned["roof-description"]

    def average_transmittance_record(original_description, clean_description, value):
        # Roof record template shared by every "Average thermal transmittance" patch
        return {
            'original_description': original_description,
            'clean_description': clean_description,
            'thermal_transmittance': value,
            'thermal_transmittance_unit': 'w/m-¦k', 'is_pitched': False, 'is_roof_room': False, 'is_loft': False,
            'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
            'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None
        }

    def has_average_transmittance_record(value, unit=None):
        # True when a roof record already covers this value (and, when a unit is given, that exact unit).
        # The conditions are evaluated left to right so records without a matching value are never inspected
        # any further.
        return any(
            record["thermal_transmittance"] == value
            and "Average thermal transmittance" in record["clean_description"]
            and (unit is None or record["thermal_transmittance_unit"] == unit)
            for record in roof_records
        )

    # Patch to handle missing floor descriptions
    cleaned["floor-description"].extend(
        [
            {'original_description': 'To external air, uninsulated (assumed)',
             'clean_description': 'To external air, no insulation', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': False,
             'is_to_external_air': True, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'none'},
            {'original_description': 'To unheated space, uninsulated (assumed)',
             'clean_description': 'To unheated space, uninsulated', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': True,
             'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'another_property_below': False,
             'insulation_thickness': 'average'}
        ]
    )

    # Patch missing roof descriptions. Each record is added exactly once: the
    # 'Pitched, Unknown loft insulation' record used to be appended twice by two identical extend() calls.
    roof_records.extend(
        [
            {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation',
             'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_pitched': True,
             'is_roof_room': False,
             'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'},
            {'original_description': 'Pitched, 300+mm loft insulation',
             'clean_description': 'Pitched, 300+ mm loft insulation', 'thermal_transmittance': None,
             'thermal_transmittance_unit': None, 'is_pitched': True, 'is_roof_room': False, 'is_loft': True,
             'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False,
             'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '300+'}
        ]
    )

    # np.arange accumulates floating point error, so every value is rounded back to two decimals before use
    thermal_transmittance_values = [round(ttv, 2) for ttv in np.arange(0, 2, 0.01)]

    # Make sure there is an average thermal transmittance record for every value
    for ttv_rounded in thermal_transmittance_values:
        if has_average_transmittance_record(ttv_rounded):
            continue

        roof_records.append(
            average_transmittance_record(
                original_description=f'Average thermal transmittance {ttv_rounded} W/m-¦K',
                clean_description=f'Average thermal transmittance {ttv_rounded} w/m-¦k',
                value=ttv_rounded
            )
        )

    # We also patch a funny unit value we found
    # NOTE(review): the lookup is on the unit "w/m?K" while the records added here store 'w/m-¦k', so records
    # added by this loop are not recognised by it on a later call - confirm whether that is intended
    for ttv_rounded in thermal_transmittance_values:
        if has_average_transmittance_record(ttv_rounded, unit="w/m?K"):
            continue

        # These descriptions always carry two decimal places, e.g. 0.1 is written as 0.10
        ttv_string = f"{ttv_rounded:.2f}"
        roof_records.append(
            average_transmittance_record(
                original_description=f'Average thermal transmittance {ttv_string} W/m?K',
                clean_description=f'Average thermal transmittance {ttv_string} w/m-¦k',
                value=ttv_rounded
            )
        )

    # Patch mainheatcont-description
    cleaned["mainheatcont-description"].extend(
        [
            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
             'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
             'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
             'rate_control': None}
        ]
    )

    # We patch this record because there is another property below
    for record in cleaned["floor-description"]:
        if record["original_description"] == '(Same dwelling below) insulated (assumed)':
            record["another_property_below"] = True
            record["thermal_transmittance"] = 0

    return cleaned
|
|
|
|
|
|
def calculate_eco4_post_ciga(
    eligiblity_counts, input_data, ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate,
    eco4_rate, archetype_conversion_rate
):
    """
    Forecast ECO4 sales, failures and cancellations from a table of eligibility counts.

    :param eligiblity_counts: DataFrame with an "ECO Eligibility" label column and a "count" column
    :param input_data: mapping holding a "ciga_list" DataFrame. When that frame is empty, the
        "eco4 - passed ciga" and "failed ciga" counts are not used
    :param ha_ciga_conversion_rate: share of assets awaiting a CIGA check that are expected to pass it
    :param ha_ciga_pass_to_sale_rate: share of assets that passed (or are expected to pass) CIGA that convert to a
        sale
    :param ha_eco4_to_sale_rate: share of ECO4 assets that need no CIGA check that convert to a sale
    :param eco4_rate: value of a single ECO4 job, used to turn counts into revenue
    :param archetype_conversion_rate: share of "subject to archetype" assets expected to pass the archetype check
    :return: dict of counts ("#") and revenue ("£") figures. All counts are plain Python ints
    """
    labels = eligiblity_counts["ECO Eligibility"]

    def count_where(mask):
        # Total number of assets whose eligibility label is selected by the boolean mask
        return eligiblity_counts[mask]["count"].sum()

    # Plain substring matching; the labels contain brackets, so regex matching is deliberately switched off
    subject_to_ciga = labels.str.contains("subject to ciga", regex=False)
    subject_to_archetype = labels.str.contains("subject to archetype", regex=False)

    # Assets that need both checks are scaled down by the archetype conversion rate first; the ones expected to
    # pass the archetype check join the assets that only need a CIGA check
    remaining_needing_ciga_and_archetype_check = count_where(subject_to_ciga & subject_to_archetype)
    remaining_needing_ciga_and_archetype_check_passed = np.round(
        remaining_needing_ciga_and_archetype_check * archetype_conversion_rate
    )
    remaining_needing_ciga_check = (
        count_where(subject_to_ciga & ~subject_to_archetype) + remaining_needing_ciga_and_archetype_check_passed
    )

    # The same scaling applies to assets that need an archetype check but no CIGA check
    eco4_no_ciga_archetype_needed = count_where(labels == "eco4 (subject to archetype)")
    eco4_no_ciga_archetype_needed_passed = np.round(eco4_no_ciga_archetype_needed * archetype_conversion_rate)
    eco4_no_ciga_needed = count_where(labels == "eco4") + eco4_no_ciga_archetype_needed_passed

    failed_archetype_check = int(
        remaining_needing_ciga_and_archetype_check +
        eco4_no_ciga_archetype_needed -
        remaining_needing_ciga_and_archetype_check_passed -
        eco4_no_ciga_archetype_needed_passed
    )

    has_ciga_check = not input_data["ciga_list"].empty

    # Confirmed sales: assets that need no CIGA check plus, when a CIGA list exists, assets that passed it
    expected_confirmed_sales = eco4_no_ciga_needed * ha_eco4_to_sale_rate
    if has_ciga_check:
        eco4_ciga_passed = count_where(labels == "eco4 - passed ciga")
        eco4_confirmed_ciga_failures = count_where(labels == "failed ciga")
        expected_confirmed_sales += eco4_ciga_passed * ha_ciga_pass_to_sale_rate
    else:
        eco4_ciga_passed = 0
        eco4_confirmed_ciga_failures = 0

    eco4_confirmed = np.round(expected_confirmed_sales)
    eco4_confirmed_cancellations = eco4_no_ciga_needed + eco4_ciga_passed - eco4_confirmed

    # Forecast for the assets still waiting for a CIGA check. When a CIGA list exists and nothing is waiting,
    # the rates are not applied at all
    nothing_left_to_forecast = has_ciga_check and not (remaining_needing_ciga_check > 0)
    if nothing_left_to_forecast:
        eco4_remaining_forecast = 0
        eco4_estimated_ciga_failures = 0
        eco4_forecast_cancellations = 0
    else:
        eco4_ciga_expected_remaining_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate)
        eco4_remaining_forecast = np.round(eco4_ciga_expected_remaining_to_pass * ha_ciga_pass_to_sale_rate)
        eco4_estimated_ciga_failures = remaining_needing_ciga_check - eco4_ciga_expected_remaining_to_pass
        eco4_forecast_cancellations = eco4_ciga_expected_remaining_to_pass - eco4_remaining_forecast

    # Every figure below is a whole number held in a numpy type; convert to plain ints so that the result has
    # the same types whichever branch was taken above
    eco4_post_ciga = int(eco4_confirmed + eco4_remaining_forecast)
    eco4_remaining_forecast = int(eco4_remaining_forecast)
    eco4_confirmed = int(eco4_confirmed)
    eco4_confirmed_ciga_failures = int(eco4_confirmed_ciga_failures)
    eco4_expected_cancellations = int(eco4_confirmed_cancellations + eco4_forecast_cancellations)

    results = {
        # Counts
        "ECO4 - post CIGA - #": eco4_post_ciga,
        "Of which confirmed - #": eco4_confirmed,
        "Of which forecast - #": eco4_remaining_forecast,
        # Revenue
        "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate,
        "Of which confirmed - £": eco4_confirmed * eco4_rate,
        "Of which forecast - £": eco4_remaining_forecast * eco4_rate,
        # Archetype check failures
        "Estimated total - failed archetype check - #": failed_archetype_check,
        "Estimated total - failed archetype check - £": failed_archetype_check * eco4_rate,
        # Ciga failures
        "Estimated total - failed CIGA": int(eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures),
        "Confirmed CIGA failures": eco4_confirmed_ciga_failures,
        "Estimated CIGA failures": int(eco4_estimated_ciga_failures),
        # Ciga failures cost
        "Estimated total - failed CIGA - £": int(
            (eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures) * eco4_rate
        ),
        "Confirmed CIGA failures - £": int(eco4_confirmed_ciga_failures * eco4_rate),
        "Estimated CIGA failures - £": int(eco4_estimated_ciga_failures * eco4_rate),
        # Expected cancellations
        "Expected cancellations - #": eco4_expected_cancellations,
        "Expected cancellations - £": eco4_expected_cancellations * eco4_rate
    }

    return results
|
|
|
|
|
|
def forecast_remaining_sales(loader):
|
|
# Assumptions:
|
|
# We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate
|
|
# and I don't want the numbers to change too much, depenent on the CIGA conversation rate
|
|
maximum_ciga_conversion = 0.75
|
|
|
|
# This is a hard limit to the allowed conversion rates to final sale. These are typically very
|
|
# high but there are some anomalies, amongst surveys that are early on
|
|
sales_conversion_lower_bound = 0.8
|
|
|
|
gbis_rate = 600
|
|
eco4_rate = 1710
|
|
|
|
# Based on ONS https://www.ons.gov.uk/peoplepopulationandcommunity/housing/bulletins/housingenglandandwales
|
|
# /census2021
|
|
# there are 5.7 million terraced properties in the UK, of the 19.3 million houses or bungalows. We therefore apply
|
|
# a 30% discount to homes that are dependent on an archetype check, since around 30% of them will be mid terraced
|
|
# This 30% is slightly harsh but we be conservative
|
|
# Therefore, the archetype check conversion rate is 70%
|
|
archetype_conversion_rate = 0.7
|
|
|
|
# 1) Calculate the conversion rate from passed CIGA to actual sale
|
|
converted_ciga_jobs = []
|
|
for ha_name, input_data in loader.data.items():
|
|
asset_list = input_data["asset_list"].copy()
|
|
survey_list = input_data["survey_list"].copy()
|
|
|
|
if survey_list.empty:
|
|
continue
|
|
|
|
ciga_dependent_assets = asset_list[
|
|
asset_list["ECO Eligibility"] == "eco4 - passed ciga"
|
|
]
|
|
|
|
# These are now the ciga dependent assets at installation
|
|
ciga_dependent_assets_at_installation = ciga_dependent_assets.merge(
|
|
survey_list[["asset_list_row_id", "installation_status"]],
|
|
how="inner",
|
|
on="asset_list_row_id"
|
|
)
|
|
|
|
# We then calculate how many get cancelled
|
|
ciga_dependent_assets_sold = ciga_dependent_assets_at_installation[
|
|
ciga_dependent_assets_at_installation["installation_status"].isin(
|
|
[
|
|
"ECO4 - installed", "ECO4 - in progress"
|
|
]
|
|
)
|
|
]
|
|
|
|
ciga_dependent_assets_failed = ciga_dependent_assets_at_installation[
|
|
~ciga_dependent_assets_at_installation["installation_status"].isin(
|
|
[
|
|
"ECO4 - installed", "ECO4 - in progress"
|
|
]
|
|
)
|
|
]
|
|
|
|
converted_ciga_jobs.append(
|
|
{
|
|
"HA Name": ha_name,
|
|
"# Ciga dependent at installation": ciga_dependent_assets_at_installation.shape[0],
|
|
"# Ciga dependent successfully installed": ciga_dependent_assets_sold.shape[0],
|
|
"# Ciga dependent failed install": ciga_dependent_assets_failed.shape[0]
|
|
}
|
|
)
|
|
|
|
converted_ciga_jobs = pd.DataFrame(converted_ciga_jobs)
|
|
|
|
# We calculate a ciga pass to install conversaion rate
|
|
median_ciga_pass_to_install = (
|
|
converted_ciga_jobs["# Ciga dependent successfully installed"].sum() /
|
|
converted_ciga_jobs["# Ciga dependent at installation"].sum()
|
|
)
|
|
|
|
# 2) Calculate the conversion rate from CIGA dependent to ciga passed
|
|
ciga_passrates = []
|
|
for ha_name, input_data in loader.data.items():
|
|
|
|
# If we don't have a ciga list, we can't do anything
|
|
if input_data["ciga_list"].empty:
|
|
continue
|
|
|
|
# 1) Calculate the conversion rate for CIGA to actual sale
|
|
asset_list = input_data["asset_list"].copy()
|
|
|
|
ciga_completed_assets = asset_list[
|
|
asset_list["ECO Eligibility"].isin(
|
|
[
|
|
"eco4 - passed ciga",
|
|
"failed ciga"
|
|
]
|
|
)
|
|
]
|
|
|
|
ciga_passed = ciga_completed_assets[
|
|
ciga_completed_assets["ECO Eligibility"].isin(
|
|
[
|
|
"eco4 - passed ciga"
|
|
]
|
|
)
|
|
]
|
|
|
|
ciga_passrates.append(
|
|
{
|
|
"Ha Name": ha_name,
|
|
"# CIGA dependent": ciga_completed_assets.shape[0],
|
|
"# CIGA passed": ciga_passed.shape[0],
|
|
}
|
|
)
|
|
|
|
ciga_passrates = pd.DataFrame(ciga_passrates)
|
|
|
|
median_ciga_success_rate = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum()
|
|
|
|
# 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install
|
|
eco4_ciga_independent_to_install = []
|
|
gbis_to_install = []
|
|
for ha_name, input_data in loader.data.items():
|
|
asset_list = input_data["asset_list"].copy()
|
|
survey_list = input_data["survey_list"].copy()
|
|
|
|
if survey_list.empty:
|
|
continue
|
|
|
|
# For properties that were identified as a typical ECO4 job, we calculate the number of properties that
|
|
# installed
|
|
# vs cancelled
|
|
|
|
typical_eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"]
|
|
typical_gbis = asset_list[asset_list["ECO Eligibility"] == "gbis"]
|
|
|
|
# Merge on the surveys
|
|
typical_eco4_installed = typical_eco4.merge(
|
|
survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
|
|
)
|
|
|
|
if not typical_eco4_installed.empty:
|
|
typical_eco4_sold = typical_eco4_installed[
|
|
typical_eco4_installed["installation_status"].isin(
|
|
[
|
|
"ECO4 - installed", "ECO4 - in progress"
|
|
]
|
|
)
|
|
]
|
|
|
|
eco4_ciga_independent_to_install.append(
|
|
{
|
|
"Ha Name": ha_name,
|
|
"# ECO4 at install stage": typical_eco4_installed.shape[0],
|
|
"# ECO4 successfully installed": typical_eco4_sold.shape[0]
|
|
}
|
|
)
|
|
|
|
typical_gbis_installed = typical_gbis.merge(
|
|
survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
|
|
)
|
|
if not typical_gbis_installed.empty:
|
|
typical_gbis_sold = typical_gbis_installed[
|
|
typical_gbis_installed["installation_status"].isin(
|
|
[
|
|
"GBIS - in progress", "GBIS - installed"
|
|
]
|
|
)
|
|
]
|
|
|
|
gbis_to_install.append(
|
|
{
|
|
"Ha Name": ha_name,
|
|
"# GBIS at install stage": typical_gbis_installed.shape[0],
|
|
"# GBIS successfully installed": typical_gbis_sold.shape[0]
|
|
}
|
|
)
|
|
|
|
eco4_ciga_independent_to_install = pd.DataFrame(eco4_ciga_independent_to_install)
|
|
gbis_to_install = pd.DataFrame(gbis_to_install)
|
|
|
|
eco4_ciga_independent_to_install["conversion"] = (
|
|
eco4_ciga_independent_to_install["# ECO4 successfully installed"] /
|
|
eco4_ciga_independent_to_install["# ECO4 at install stage"]
|
|
)
|
|
eco4_ciga_independent_to_install_clipped = eco4_ciga_independent_to_install[
|
|
eco4_ciga_independent_to_install["conversion"] >= sales_conversion_lower_bound
|
|
]
|
|
|
|
gbis_to_install["conversion"] = (
|
|
gbis_to_install["# GBIS successfully installed"] /
|
|
gbis_to_install["# GBIS at install stage"]
|
|
)
|
|
gbis_to_install_clipped = gbis_to_install[
|
|
gbis_to_install["conversion"] >= sales_conversion_lower_bound
|
|
]
|
|
|
|
median_eco4_to_install = (
|
|
eco4_ciga_independent_to_install_clipped["# ECO4 successfully installed"].sum() /
|
|
eco4_ciga_independent_to_install_clipped["# ECO4 at install stage"].sum()
|
|
)
|
|
|
|
median_gbis_to_install = (
|
|
gbis_to_install_clipped["# GBIS successfully installed"].sum() /
|
|
gbis_to_install_clipped["# GBIS at install stage"].sum()
|
|
)
|
|
|
|
# Produce the final output
|
|
december_figures = loader.december_figures.copy()
|
|
december_figures = december_figures.fillna(0)
|
|
# If we have negative remaining, it means that actually sold more gbis than they initially thought so we set
|
|
# remaining to 0
|
|
december_figures["ECO4 remaining"] = np.where(
|
|
december_figures["ECO4 remaining"] < 0, 0, december_figures["ECO4 remaining"]
|
|
)
|
|
december_figures["GBIS remaining"] = np.where(
|
|
december_figures["GBIS remaining"] < 0, 0, december_figures["GBIS remaining"]
|
|
)
|
|
|
|
results = []
|
|
for ha_name, input_data in loader.data.items():
|
|
|
|
# Original warmfront figures - ECO4
|
|
original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name]
|
|
if original_warmfront_estimates.empty:
|
|
# Append an empty row
|
|
original_warmfront_estimates = december_figures.head(1).copy()
|
|
for k in original_warmfront_estimates.columns:
|
|
original_warmfront_estimates[k] = 0
|
|
original_warmfront_estimates["HA Name"] = ha_name
|
|
|
|
original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0]
|
|
original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0]
|
|
original_warmfront_sold_eco4 = (
|
|
original_warmfront_estimates["No. of Tech surveys complete - Eco 4"].values[0] * eco4_rate
|
|
)
|
|
|
|
original_warmfront_eco4_revenue = original_warmfront_eco4 * eco4_rate
|
|
original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate
|
|
original_warmfront_sold_gbis = (
|
|
original_warmfront_estimates["No. of Tech surveys complete - GBIS"].values[0] * gbis_rate
|
|
)
|
|
|
|
# Original warmfront figures - GBIS
|
|
|
|
original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0]
|
|
original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0]
|
|
|
|
original_warmfront_gbis_revenue = (
|
|
original_warmfront_gbis * gbis_rate
|
|
)
|
|
original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate
|
|
|
|
# Asset list - ECO4
|
|
asset_list = input_data["asset_list"].copy()
|
|
survey_list = input_data["survey_list"].copy()
|
|
|
|
if survey_list.empty:
|
|
asset_list_remaining = asset_list.copy()
|
|
else:
|
|
# For HA6, there are a small number of postcodes that do not match to any item in the asset list
|
|
survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
|
|
asset_list_remaining = asset_list.merge(
|
|
survey_list[["asset_list_row_id", "installation_status"]],
|
|
how="left",
|
|
on="asset_list_row_id"
|
|
)
|
|
# Anything that has an installation has gone to installation, and therefore is not remaining
|
|
asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])]
|
|
asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"])
|
|
|
|
eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index()
|
|
eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index()
|
|
|
|
eco4_pre_ciga = eligiblity_counts[
|
|
eligiblity_counts["ECO Eligibility"].isin(
|
|
[
|
|
"eco4",
|
|
"eco4 (subject to ciga)",
|
|
"eco4 - passed ciga",
|
|
"failed ciga",
|
|
"eco4 (subject to ciga) (subject to archetype)",
|
|
"eco4 (subject to archetype)"
|
|
]
|
|
)
|
|
]["count"].sum()
|
|
|
|
eco4_pre_ciga_remaining = eligiblity_counts_remaining[
|
|
eligiblity_counts_remaining["ECO Eligibility"].isin(
|
|
[
|
|
"eco4",
|
|
"eco4 (subject to ciga)",
|
|
"eco4 - passed ciga",
|
|
"failed ciga",
|
|
"eco4 (subject to ciga) (subject to archetype)",
|
|
"eco4 (subject to archetype)"
|
|
]
|
|
)
|
|
]["count"].sum()
|
|
|
|
eco4_pre_ciga_revenue = eco4_pre_ciga * eco4_rate
|
|
eco4_pre_ciga_remaining_revenue = eco4_pre_ciga_remaining * eco4_rate
|
|
|
|
# Total Eligible - this is what passed ciga checks + strict. If we don't have what passed CIGA, we estimate
|
|
# We check if the HA has done a CIGA check. Also, if we have assets dormant at CIGA, we estimate what will
|
|
# convert
|
|
# We estimate a conversion for anything left post CIGA
|
|
ha_ciga_conversion = ciga_passrates[ciga_passrates["Ha Name"] == ha_name]
|
|
if not ha_ciga_conversion.empty:
|
|
ha_ciga_conversion_rate = (
|
|
ha_ciga_conversion["# CIGA passed"].values[0] / ha_ciga_conversion["# CIGA dependent"].values[0]
|
|
)
|
|
else:
|
|
ha_ciga_conversion_rate = (
|
|
median_ciga_success_rate if median_ciga_success_rate <= maximum_ciga_conversion else
|
|
maximum_ciga_conversion
|
|
)
|
|
|
|
# We also need the ha ciga passed to install success rate
|
|
ha_ciga_pass_to_sale = converted_ciga_jobs[converted_ciga_jobs["HA Name"] == ha_name]
|
|
if not ha_ciga_pass_to_sale.empty and ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] != 0:
|
|
ha_ciga_pass_to_sale_rate = (
|
|
ha_ciga_pass_to_sale["# Ciga dependent successfully installed"].values[0] /
|
|
ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0]
|
|
)
|
|
else:
|
|
ha_ciga_pass_to_sale_rate = median_ciga_pass_to_install
|
|
|
|
ha_eco4_to_sale = eco4_ciga_independent_to_install_clipped[
|
|
eco4_ciga_independent_to_install_clipped["Ha Name"] == ha_name
|
|
]
|
|
if not ha_eco4_to_sale.empty:
|
|
ha_eco4_to_sale_rate = (
|
|
ha_eco4_to_sale['# ECO4 successfully installed'].values[0] /
|
|
ha_eco4_to_sale['# ECO4 at install stage'].values[0]
|
|
)
|
|
else:
|
|
ha_eco4_to_sale_rate = median_eco4_to_install
|
|
|
|
eco4_post_ciga_total_results = calculate_eco4_post_ciga(
|
|
eligiblity_counts=eligiblity_counts,
|
|
input_data=input_data,
|
|
ha_ciga_conversion_rate=ha_ciga_conversion_rate,
|
|
ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate,
|
|
ha_eco4_to_sale_rate=ha_eco4_to_sale_rate,
|
|
eco4_rate=eco4_rate,
|
|
archetype_conversion_rate=archetype_conversion_rate
|
|
)
|
|
|
|
eco4_post_ciga_remaining_results = calculate_eco4_post_ciga(
|
|
eligiblity_counts=eligiblity_counts_remaining,
|
|
input_data=input_data,
|
|
ha_ciga_conversion_rate=ha_ciga_conversion_rate,
|
|
ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate,
|
|
ha_eco4_to_sale_rate=ha_eco4_to_sale_rate,
|
|
eco4_rate=eco4_rate,
|
|
archetype_conversion_rate=archetype_conversion_rate
|
|
)
|
|
|
|
# Calculate the delta compared to Warmfront's original remaining
|
|
if original_warmfront_remaining_eco4 == 0:
|
|
eco4_delta_vs_original_estimate_remaining = "N/A"
|
|
else:
|
|
eco4_delta_vs_original_estimate_remaining = ((eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] -
|
|
original_warmfront_remaining_eco4) /
|
|
original_warmfront_remaining_eco4)
|
|
|
|
# GBIS Figures
|
|
# Estimate the GBIS conversion rate
|
|
ha_gbis_sale_conversion = gbis_to_install_clipped[
|
|
gbis_to_install_clipped["Ha Name"] == ha_name
|
|
]
|
|
|
|
if not ha_gbis_sale_conversion.empty:
|
|
ha_gbis_sale_conversion = (
|
|
ha_gbis_sale_conversion["# GBIS successfully installed"].values[0] /
|
|
ha_gbis_sale_conversion["# GBIS at install stage"].values[0]
|
|
)
|
|
else:
|
|
ha_gbis_sale_conversion = median_gbis_to_install
|
|
|
|
gbis_total_pre_cancellations = eligiblity_counts[
|
|
eligiblity_counts["ECO Eligibility"] == "gbis"
|
|
]["count"].sum()
|
|
|
|
gbis_total_pre_cancellations_revenue = gbis_total_pre_cancellations * gbis_rate
|
|
# gbis_total = int(np.round(gbis_total_pre_cancellations * ha_gbis_sale_conversion))
|
|
# gbis_total_revenue = int(gbis_total * gbis_rate)
|
|
|
|
gbis_remaining_pre_cancellations = eligiblity_counts_remaining[
|
|
eligiblity_counts_remaining["ECO Eligibility"] == "gbis"
|
|
]["count"].sum()
|
|
gbis_remaining_pre_cancellations_revenue = (
|
|
gbis_remaining_pre_cancellations * gbis_rate
|
|
)
|
|
# This is the gbis jobs we expect to sell
|
|
gbis_remaining = int(np.round(gbis_remaining_pre_cancellations * ha_gbis_sale_conversion))
|
|
gbis_remaining_revenue = int(gbis_remaining * gbis_rate)
|
|
# This is the number we expect to cancel
|
|
gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining)
|
|
gbis_remaining_expected_cancellations_revenue = gbis_remaining_expected_cancellations * gbis_rate
|
|
|
|
# GBIS delta
|
|
if original_warmfront_remaining_gbis == 0:
|
|
gbis_delta_vs_original_estimate_remaining = "N/A"
|
|
else:
|
|
gbis_delta_vs_original_estimate_remaining = (
|
|
(gbis_remaining - original_warmfront_remaining_gbis) / original_warmfront_remaining_gbis
|
|
)
|
|
|
|
# Current sales figures
|
|
# For any sales surveys that are complete, that could still cancel, we apply a conversion rate
|
|
eco4_actually_sold = 0
|
|
eco4_confirmed_cancellations = 0
|
|
eco4_expected_cancellations = 0
|
|
|
|
gbis_actually_sold = 0
|
|
gbis_confirmed_cancellations = 0
|
|
gbis_expected_cancellations = 0
|
|
if not survey_list.empty:
|
|
surveys_with_eligibility = survey_list.merge(
|
|
asset_list[["asset_list_row_id", "ECO Eligibility"]],
|
|
how="left", on="asset_list_row_id"
|
|
)
|
|
completed_eco4_sales = surveys_with_eligibility[
|
|
surveys_with_eligibility["installation_status"] == "ECO4 - installed"
|
|
].shape[0]
|
|
incomplete_eco4_sales = surveys_with_eligibility[
|
|
(surveys_with_eligibility["installation_status"] == "ECO4 - in progress") &
|
|
(~surveys_with_eligibility["ECO Eligibility"].isin(
|
|
["eco4 - passed ciga"])
|
|
)
|
|
].shape[0]
|
|
incomplete_eco4_sales_ciga = surveys_with_eligibility[
|
|
(surveys_with_eligibility["installation_status"] == "ECO4 - in progress") &
|
|
(surveys_with_eligibility["ECO Eligibility"].isin(
|
|
["eco4 - passed ciga"])
|
|
)
|
|
].shape[0]
|
|
|
|
eco4_confirmed_cancellations = surveys_with_eligibility[
|
|
surveys_with_eligibility["installation_status"] == "ECO4 - cancelled"
|
|
].shape[0]
|
|
|
|
expected_eco4_sales_no_ciga = np.round(incomplete_eco4_sales * ha_eco4_to_sale_rate)
|
|
expected_eco4_sales_ciga = np.round(incomplete_eco4_sales_ciga * ha_ciga_pass_to_sale_rate)
|
|
|
|
eco4_expected_cancellations = (incomplete_eco4_sales + incomplete_eco4_sales_ciga) - (
|
|
expected_eco4_sales_no_ciga + expected_eco4_sales_ciga
|
|
)
|
|
eco4_expected_cancellations = int(np.round(eco4_expected_cancellations))
|
|
|
|
eco4_actually_sold = eco4_rate * (
|
|
completed_eco4_sales + expected_eco4_sales_no_ciga + expected_eco4_sales_ciga
|
|
)
|
|
|
|
completed_gbis_sales = surveys_with_eligibility[
|
|
surveys_with_eligibility["installation_status"] == "GBIS - installed"
|
|
].shape[0]
|
|
incomplete_gbis_sales = surveys_with_eligibility[
|
|
(surveys_with_eligibility["installation_status"] == "GBIS - in progress")
|
|
].shape[0]
|
|
|
|
# Get confirmed cancellations
|
|
gbis_confirmed_cancellations = surveys_with_eligibility[
|
|
surveys_with_eligibility["installation_status"] == "GBIS - cancelled"
|
|
].shape[0]
|
|
|
|
expected_gbis_unconfirmed_sales = np.round(incomplete_gbis_sales * ha_gbis_sale_conversion)
|
|
|
|
gbis_expected_cancellations = int(incomplete_gbis_sales - expected_gbis_unconfirmed_sales)
|
|
|
|
gbis_actually_sold = completed_gbis_sales * gbis_rate + (
|
|
expected_gbis_unconfirmed_sales * gbis_rate
|
|
)
|
|
|
|
# Add in the variance:
|
|
# We should expect that the pre-ciga total is:
|
|
# 1) The number of post CIGA successes +
|
|
# 2) The number of archetype failures +
|
|
# 2) the number of CIGA failures +
|
|
# 3) The number of cancellations
|
|
variance_total = eco4_pre_ciga - (
|
|
eco4_post_ciga_total_results["ECO4 - post CIGA - #"] +
|
|
eco4_post_ciga_total_results["Estimated total - failed archetype check - #"] +
|
|
eco4_post_ciga_total_results['Estimated total - failed CIGA'] +
|
|
eco4_post_ciga_total_results["Expected cancellations - #"]
|
|
)
|
|
if variance_total != 0:
|
|
raise ValueError("Something went wrong in variance total")
|
|
|
|
variance_remaining = eco4_pre_ciga_remaining - (
|
|
eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] +
|
|
eco4_post_ciga_remaining_results["Estimated total - failed archetype check - #"] +
|
|
eco4_post_ciga_remaining_results['Estimated total - failed CIGA'] +
|
|
eco4_post_ciga_remaining_results["Expected cancellations - #"]
|
|
)
|
|
|
|
if variance_remaining != 0:
|
|
raise ValueError("Something went wrong in variance remaining")
|
|
|
|
# We also check variances to make sure that the pre-CIGA ECO4 total equals
|
|
# 1) Pre CIGA remaining +
|
|
# 2) ECO4 sold +
|
|
# 3) ECO4 confirmed cancellations +
|
|
# 4) ECO4 unconfirmed cancellations
|
|
|
|
pre_ciga_eco4_variance = (
|
|
eco4_pre_ciga_revenue -
|
|
eco4_pre_ciga_remaining_revenue -
|
|
eco4_actually_sold -
|
|
eco4_confirmed_cancellations * eco4_rate -
|
|
eco4_expected_cancellations * eco4_rate
|
|
)
|
|
|
|
if pre_ciga_eco4_variance != 0:
|
|
raise ValueError("Something went wrong in pre_ciga_eco4_variance")
|
|
|
|
# Check GBIS total variance
|
|
# The total before cancellations should equal:
|
|
# The number of sold +
|
|
# The number of confirmed cancelled +
|
|
# The number of expected cancelled +
|
|
# The number of remaining
|
|
gbis_variance = gbis_total_pre_cancellations - (
|
|
gbis_actually_sold / gbis_rate +
|
|
gbis_confirmed_cancellations +
|
|
gbis_expected_cancellations +
|
|
gbis_remaining_pre_cancellations
|
|
)
|
|
|
|
if gbis_variance != 0:
|
|
raise ValueError("Something went wrong in gbis_variance")
|
|
|
|
# We expect the remaining to equal expected sales + expected cancellations
|
|
gbis_variance_2 = gbis_remaining_pre_cancellations - (
|
|
gbis_remaining +
|
|
gbis_remaining_expected_cancellations
|
|
)
|
|
|
|
if gbis_variance_2 != 0:
|
|
raise ValueError("Something went wrong in gbis_variance2")
|
|
|
|
# Update the GBIS sold, since Warmfront often sold more GBIS that expected
|
|
original_warmfront_gbis_revenue = original_warmfront_sold_gbis + original_warmfront_remaining_gbis_revenue
|
|
original_warmfront_gbis = (
|
|
original_warmfront_sold_gbis / gbis_rate + original_warmfront_remaining_gbis_revenue / gbis_rate
|
|
)
|
|
|
|
to_append = {
|
|
("", "", "", "HA Name"): ha_name,
|
|
# ECO4 - original warmfront figures
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4,
|
|
("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4,
|
|
("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue,
|
|
("ECO4 original", "", "Sold or cancelled - £", ""): original_warmfront_sold_eco4,
|
|
("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue,
|
|
# GBIS - original warmfront figures
|
|
("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis,
|
|
("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis,
|
|
("GBIS original", "", "Total - £", ""): original_warmfront_gbis_revenue,
|
|
("GBIS original", "", "Sold or cancelled - £", ""): original_warmfront_sold_gbis,
|
|
("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue,
|
|
# ECO4 - asset list, pre-ciga
|
|
("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga,
|
|
("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining,
|
|
("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue,
|
|
("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue,
|
|
("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL", ""): pre_ciga_eco4_variance,
|
|
("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL VS ELIGIBLE & INELIGIBLE", ""): variance_total,
|
|
("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 REMAINING VS ELIGIBLE & INELIGIBLE", ""):
|
|
variance_remaining,
|
|
("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold,
|
|
("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations * eco4_rate,
|
|
# This is for jobs that are in-progress and could still cancel
|
|
("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations * eco4_rate,
|
|
# ECO4 - asset list, post ciga, total
|
|
("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"):
|
|
eco4_post_ciga_total_results[
|
|
"ECO4 - post CIGA - #"],
|
|
("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[
|
|
"ECO4 - post CIGA - £"],
|
|
# ECO4 - asset list, post ciga, remaining
|
|
("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[
|
|
"ECO4 - post CIGA - #"],
|
|
("ECO4 post-ciga", "", "Estimated remaining eligible - £", ""): eco4_post_ciga_remaining_results[
|
|
"ECO4 - post CIGA - £"],
|
|
("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %",
|
|
""): eco4_delta_vs_original_estimate_remaining,
|
|
("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""):
|
|
eco4_post_ciga_remaining_results["Of which confirmed - #"],
|
|
("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - £", ""):
|
|
eco4_post_ciga_remaining_results["Of which confirmed - £"],
|
|
("ECO4 post-ciga", "", "Of which forecast - #", ""):
|
|
eco4_post_ciga_remaining_results["Of which forecast - #"],
|
|
("ECO4 post-ciga", "", "Of which forecast - £", ""):
|
|
eco4_post_ciga_remaining_results["Of which forecast - £"],
|
|
# Expected ECO4 cancellations
|
|
("ECO4 Cancellations", "", "Of which expected cancellations - #", ""): eco4_post_ciga_remaining_results[
|
|
"Expected cancellations - #"
|
|
],
|
|
("ECO4 Cancellations", "", "Of which expected cancellations - £", ""): eco4_post_ciga_remaining_results[
|
|
"Expected cancellations - £"
|
|
],
|
|
# Archetype check failures
|
|
("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - #", ""):
|
|
eco4_post_ciga_remaining_results['Estimated total - failed archetype check - #'],
|
|
("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - £", ""):
|
|
eco4_post_ciga_remaining_results['Estimated total - failed archetype check - £'],
|
|
# CIGA failures
|
|
("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[
|
|
'Estimated total - failed CIGA'
|
|
],
|
|
("ECO4 CIGA failures", "", "Estimated total - failed CIGA - £", ""): eco4_post_ciga_remaining_results[
|
|
'Estimated total - failed CIGA - £'
|
|
],
|
|
("ECO4 CIGA failures", "", "Confirmed failures - #", ""): eco4_post_ciga_remaining_results[
|
|
"Confirmed CIGA failures"
|
|
],
|
|
("ECO4 CIGA failures", "", "Confirmed failures - £", ""): eco4_post_ciga_remaining_results[
|
|
"Confirmed CIGA failures - £"
|
|
],
|
|
("ECO4 CIGA failures", "", "Estimated failures - #", ""): eco4_post_ciga_remaining_results[
|
|
"Estimated CIGA failures"
|
|
],
|
|
("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[
|
|
"Estimated CIGA failures - £"
|
|
],
|
|
# GBIS postcode list
|
|
("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total_pre_cancellations,
|
|
("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"):
|
|
gbis_total_pre_cancellations_revenue,
|
|
("GBIS Postcode list", "Warmfront post code list", "GBIS VARIANCE", "GBIS total"): gbis_variance,
|
|
("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold,
|
|
("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate,
|
|
# This is for jobs that are in-progress and could still cancel
|
|
("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations * gbis_rate,
|
|
("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"):
|
|
gbis_remaining_pre_cancellations,
|
|
("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"):
|
|
gbis_remaining_pre_cancellations_revenue,
|
|
("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""):
|
|
gbis_delta_vs_original_estimate_remaining,
|
|
# Expected cancellations
|
|
(
|
|
"GBIS Postcode list", "", "Of which expected sales - £ - £",
|
|
"GBIS total"): gbis_remaining_revenue,
|
|
("GBIS Postcode list", "", "Of which expected cancellations -£", "GBIS total"):
|
|
gbis_remaining_expected_cancellations_revenue
|
|
}
|
|
|
|
# Make sure nothing is forgotten due to duplicate multi-index keys
|
|
if len(to_append) != 51:
|
|
raise ValueError("Something went wrong")
|
|
|
|
results.append(to_append)
|
|
|
|
results = pd.DataFrame(results)
|
|
results.to_csv("pipeline_remaining_raw.csv")
|
|
|
|
totals_row = {}
|
|
for col in results.columns:
|
|
if col == ('', '', '', 'HA Name'):
|
|
totals_row[col] = "Total"
|
|
elif col in [
|
|
("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", ""),
|
|
("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")
|
|
]:
|
|
totals_row[col] = None
|
|
else:
|
|
totals_row[col] = results[col].sum()
|
|
|
|
# For the delta columns, we calculate the delta on the totals
|
|
totals_row[("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", "")] = (
|
|
(
|
|
totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] -
|
|
totals_row[("ECO4 original", "", "Remaining - #", "")]
|
|
) / totals_row[("ECO4 original", "", "Remaining - #", "")]
|
|
)
|
|
|
|
totals_row[("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")] = (
|
|
(
|
|
totals_row[("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")] -
|
|
totals_row[("GBIS original", "", "Remaining - #", "")]
|
|
) / totals_row[("GBIS original", "", "Remaining - #", "")]
|
|
)
|
|
|
|
blank_row = pd.DataFrame([{col: "" for col in results.columns}])
|
|
|
|
# Put together a Warmfront original remaining ECO4 vs asset list remaining ECO4 and same for GBIS, as well as totals
|
|
|
|
# ECO4 Headlines
|
|
headline_eco4_original_remaining = totals_row[("ECO4 original", "", "Remaining - #", "")]
|
|
headline_eco4_original_remaining_revenue = totals_row[("ECO4 original", "", "Remaining - £", "")]
|
|
headline_eco4_postcode_list_remaining = totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")]
|
|
headline_eco4_postcode_list_remaining_revenue = totals_row[
|
|
("ECO4 post-ciga", "", "Estimated remaining eligible - £", "")
|
|
]
|
|
headline_eco4_delta = 100 * (
|
|
(headline_eco4_postcode_list_remaining - headline_eco4_original_remaining) /
|
|
headline_eco4_original_remaining
|
|
)
|
|
headline_eco4_delta = round(headline_eco4_delta, 1)
|
|
|
|
# GBIS Headlines
|
|
headline_gbis_original_remaining = totals_row[("GBIS original", "", "Remaining - #", "")]
|
|
headline_gbis_original_remaining_revenue = totals_row[("GBIS original", "", "Remaining - £", "")]
|
|
headline_gbis_postcode_list_remaining = totals_row[
|
|
("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")
|
|
]
|
|
headline_gbis_postcode_list_remaining_revenue = totals_row[
|
|
("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total")
|
|
]
|
|
headline_gbis_delta = 100 * (
|
|
(headline_gbis_postcode_list_remaining - headline_gbis_original_remaining) /
|
|
headline_gbis_original_remaining
|
|
)
|
|
headline_gbis_delta = round(headline_gbis_delta, 1)
|
|
|
|
headline_original_total_revenue_remaining = (
|
|
headline_eco4_original_remaining_revenue + headline_gbis_original_remaining_revenue
|
|
)
|
|
|
|
headline_postcode_list_total_revenue_remaining = (
|
|
headline_eco4_postcode_list_remaining_revenue + headline_gbis_postcode_list_remaining_revenue
|
|
)
|
|
headline_total_delta = 100 * (
|
|
(headline_postcode_list_total_revenue_remaining - headline_original_total_revenue_remaining) /
|
|
headline_original_total_revenue_remaining
|
|
)
|
|
headline_total_delta = round(headline_total_delta, 1)
|
|
|
|
headline_eco4_sold_since_november = (
|
|
totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] +
|
|
totals_row[('ECO4 pre-ciga', '', 'Confirmed cancellations - £', '')] + # confirmed canclleations
|
|
totals_row[('ECO4 pre-ciga', '', 'Unconfirmed cancellations - £', '')] - # expected cancellations
|
|
totals_row[('ECO4 original', '', 'Sold or cancelled - £', '')]
|
|
)
|
|
|
|
headline_gbis_sold_since_november = (
|
|
totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] +
|
|
totals_row[("GBIS Postcode list", "", "Confirmed cancellations - £", "")] + # confirmed cancellations
|
|
totals_row[("GBIS Postcode list", "", "Unconfirmed cancellations - £", "")] - # expected cancellations
|
|
totals_row[('GBIS original', '', 'Sold or cancelled - £', '')]
|
|
)
|
|
|
|
headlines = [
|
|
{
|
|
("", "", "", "HA Name"): "Headlines",
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 Remaining - November - #",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_original_remaining
|
|
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 Remaining - November - £",
|
|
(
|
|
"", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_eco4_original_remaining_revenue
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 Sold or cancelled since November - £",
|
|
(
|
|
"", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_eco4_sold_since_november
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - #",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - £",
|
|
("", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_eco4_postcode_list_remaining_revenue
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 £ remaining delta - %",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_eco4_delta) + "%"
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS Remaining - November - #",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS Remaining - November - £",
|
|
(
|
|
"", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_gbis_original_remaining_revenue
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS Sold or cancelled since November - £",
|
|
(
|
|
"", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_gbis_sold_since_november
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS Remaining - post code list - #",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS Remaining - post code list - £",
|
|
("", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_gbis_postcode_list_remaining_revenue
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS delta %",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_gbis_delta) + "%"
|
|
},
|
|
# Total revenue
|
|
{
|
|
("", "", "", "HA Name"): "Total Remaining - November - £",
|
|
("", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_original_total_revenue_remaining
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "Total Remaining - post code list (post CIGA) - £",
|
|
("", "Original Warmfront estimate", "Total - #",
|
|
"ECO4 - November"): headline_postcode_list_total_revenue_remaining
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "Total Remaining delta %",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_total_delta) + "%"
|
|
},
|
|
]
|
|
|
|
assumptions = [
|
|
{
|
|
("", "", "", "HA Name"): "Assumptions",
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "ECO4 rate",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(eco4_rate)
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "GBIS rate",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(gbis_rate)
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "Median CIGA pass rate",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
|
|
round(median_ciga_success_rate * 100, 1)) + "%",
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "Maximum allowed CIGA pass rate",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
|
|
round(maximum_ciga_conversion * 100, 1)) + "%",
|
|
("ECO4 original", "", "Remaining - #",
|
|
""): "- Maximum allowed CIGA conversion for HAs without CIGA checks We do not allow above this to be "
|
|
"conservative"
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
|
|
round(median_eco4_to_install * 100, 1)) + "%",
|
|
("ECO4 original", "", "Remaining - #",
|
|
""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Surveys that resulted "
|
|
"in cancelled install are excluded."
|
|
},
|
|
{
|
|
("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate",
|
|
("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(
|
|
round(median_ciga_pass_to_install * 100, 1)) + "%",
|
|
("ECO4 original", "", "Remaining - #",
|
|
""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Surveys that resulted in "
|
|
"cancelled installs are excluded."
|
|
}
|
|
]
|
|
|
|
results = pd.concat(
|
|
[
|
|
results,
|
|
pd.DataFrame([totals_row]),
|
|
blank_row,
|
|
pd.DataFrame(headlines),
|
|
blank_row,
|
|
blank_row,
|
|
pd.DataFrame(assumptions)
|
|
]
|
|
)
|
|
with open("HA Remaining Analysis.csv", "w", newline="") as file:
|
|
# Write the DataFrame data without the index (adjust if you want the index).
|
|
results.to_csv(file, header=True, index=False)
|
|
|
|
|
|
def fml_data_pull(loader):
    """
    Look up the newest EPC for every potentially eligible property of each housing association (HA)
    and store the results in S3, one pickle per HA.

    For each HA in the hard-coded list, the rows of ``loader.data[ha]["asset_list"]`` whose
    "ECO Eligibility" is not "not eligible" are searched via ``SearchEpc``. Properties without an EPC
    are skipped. The collected EPCs (tagged with their ``asset_list_row_id``) are saved to
    ``ha-analysis/revised/{ha}/epc_data.pickle`` in the ``retrofit-datalake-dev`` bucket.

    A failure anywhere in the processing of one HA aborts that HA only: it is reported and the loop
    moves on to the next HA.

    :param loader: object exposing ``data[ha]["asset_list"]`` as a DataFrame for each HA
    :return: list of HA names whose pull failed (empty when everything succeeded)
    """
    ha_names = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13",
        "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12",
        "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49",
        'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20",
    ]

    # Can't pull from EPC database because it's based in Scotland
    # "HAXXX", "HAXX"

    # NOTE(review): hard-coded credential committed to source control. It should be rotated and read
    # from configuration instead; left unchanged here so the lookup behaviour stays the same.
    epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA="

    failed_has = []
    for ha in ha_names:
        print(f"Pulling data for {ha}")
        try:
            asset_list = loader.data[ha]["asset_list"].copy()
            # Properties found as eligible
            candidates = asset_list[asset_list["ECO Eligibility"] != "not eligible"]

            # For each property, search for the latest EPC
            epc_data = []
            for _, row in tqdm(candidates.iterrows(), total=candidates.shape[0]):

                property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha)

                if ha == "HAXXX":
                    address_parts = [
                        row["Door Number"], row["Address Line 1"], row["Address Line 2"],
                        row["Address Line 3"], row["Postcode"],
                    ]
                    full_address = ", ".join(str(part) for part in address_parts if part is not None)
                else:
                    full_address = row["matching_address"]

                searcher = SearchEpc(
                    address1=str(row["HouseNo"]),
                    postcode=row["matching_postcode"],
                    auth_token=epc_api_key,
                    os_api_key="",
                    property_type=property_type,
                    full_address=full_address,
                    fast=True
                )
                # Force the skipping of estimating the EPC
                searcher.ordnance_survey_client.property_type = None
                searcher.ordnance_survey_client.built_form = None

                searcher.find_property(skip_os=True)
                if searcher.newest_epc is None:
                    # No EPC found for this property: nothing to record
                    continue

                epc_data.append(
                    {
                        "asset_list_row_id": row["asset_list_row_id"],
                        **searcher.newest_epc.copy()
                    }
                )

            # Save the data in S3 as a pickled DataFrame
            epc_data_df = pd.DataFrame(epc_data)
            save_pickle_to_s3(
                data=epc_data_df,
                bucket_name="retrofit-datalake-dev",
                s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle"
            )
        except Exception as e:
            # Best-effort per HA: keep going, but never lose track of what failed or why
            print(f"Failed to pull data for {ha}: {e!r}")
            failed_has.append(ha)

    if failed_has:
        print(f"Data pull failed for {len(failed_has)} HA(s): {failed_has}")

    return failed_has
|
|
|
|
|
|
def extract_lower_bound(age_band):
    """
    Return the first year of an age band string shaped like "<prefix>: <start>-<end>".

    The text between the first and second colon is taken, cut at its first hyphen, stripped and
    parsed as an integer. 1930 is returned when the value is missing (``pd.isna``), when there is no
    colon, or when the extracted text is not an integer.

    NOTE(review): bands without a hyphenated range after the colon (for example "...: 2007 onwards"
    or "...: before 1900") therefore also fall back to 1930 - confirm this is intended.

    :param age_band: age band text, or a missing value
    :return: the starting year as an int, or 1930 as the fallback
    """
    fallback_year = 1930

    if pd.isna(age_band):
        return fallback_year

    colon_parts = age_band.split(':')
    if len(colon_parts) < 2:
        # Nothing after a colon to parse
        return fallback_year

    start_text = colon_parts[1].split('-')[0].strip()
    try:
        return int(start_text)
    except ValueError:
        return fallback_year
|
|
|
|
|
|
def classify_loft(x):
    """
    Grade a property's loft as "high", "medium" or "unlikely" from its roof insulation thickness.

    - thickness <= 100: "high"
    - thickness <= 200: "medium"
    - thickness <= 270 and ``epc_age`` >= 5 * 365: "medium"
    - anything else (including a NaN thickness): "unlikely"

    ``epc_age`` is only read when the thickness is above 200 and at most 270.
    Presumably thickness is in millimetres and ``epc_age`` in days - confirm against the caller.

    :param x: mapping / row with "roof_insulation_thickness" (convertible to float) and "epc_age"
    :return: one of "high", "medium", "unlikely"
    """
    thickness = float(x["roof_insulation_thickness"])

    if thickness <= 200:
        # The two lowest bands do not depend on the age of the EPC
        return "high" if thickness <= 100 else "medium"

    if thickness <= 270 and x["epc_age"] >= 5 * 365:
        return "medium"

    return "unlikely"
|
|
|
|
|
|
def fml_analysis(loader):
    """
    Estimate, per housing association (HA), how many remaining properties still look ECO4 / GBIS
    eligible once the asset list is cross-checked against the EPC data stored in S3.

    For every HA in the hard-coded list this:
      1. reads the original "ECO4 remaining" / "GBIS remaining" figures from ``loader.december_figures``;
      2. joins the not-"not eligible" asset list rows to the pickled EPC data read from
         ``ha-analysis/revised/{ha_name}/epc_data.pickle`` (rows without an EPC are marked ``estimated``);
      3. drops properties that already have an installation status in the survey list;
      4. resolves "subject to archetype" eligibilities from the EPC property type / built form;
      5. classifies the loft from the roof insulation thickness (``classify_loft``);
      6. counts the properties with an EPC that pass the wall / roof / EPC score filters, and
         extrapolates the same conversion rates to the properties without an EPC.

    :param loader: object exposing ``december_figures`` (DataFrame) and ``data[ha_name]`` with the
        "asset_list", "survey_list" and "ciga_list" DataFrames
    :return: None. The per-HA results are written to "analysis - revised - audit update.csv"
    """
    # CIGA pass rate used for HAs that have no CIGA list of their own
    assumed_ciga_pass_rate = 0.731
    ha_names = [
        "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13",
        "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12",
        "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49",
        'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20",
    ]

    # EPC wall descriptions accepted for properties that do not go through a CIGA check
    no_ciga_cavity_descriptions = [
        "Cavity wall, as built, insulated (assumed)",
        "Cavity wall, as built, no insulation (assumed)",
        "Cavity wall, as built, partial insulation (assumed)",
        "Cavity wall, no insulation (assumed)",
        "Cavity wall, partial insulation (assumed)",
        "Cavity wall,",
        "Cavity wall, insulated (assumed)",
    ]

    # TODO: There will be some properties that are subject to CIGA that do not look like they need a CIGA check!
    # Pass them! Non-invasives will have checked the wall though

    results = []
    for ha_name in tqdm(ha_names):

        original_figures = loader.december_figures[
            loader.december_figures["HA Name"] == ha_name
        ].copy()
        original_remaining = original_figures["ECO4 remaining"].values[0]
        original_gbis_remaining = original_figures["GBIS remaining"].values[0]

        # Read in the epc data
        asset_list = loader.data[ha_name]["asset_list"].copy()
        # Properties found as eligible
        eligible_assets = asset_list[asset_list["ECO Eligibility"] != "not eligible"]
        epc_data = read_pickle_from_s3(
            bucket_name="retrofit-datalake-dev",
            s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle"
        )
        # We make sure we don't have duplicates. We do a super basic drop duplicates because it shouldn't be
        # a huge issue at this point
        epc_data = epc_data.drop_duplicates("uprn")

        # Time from the inspection to now, in days
        epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days
        if "estimated" not in epc_data.columns:
            # For all after HA7, we don't use estimated surveys
            epc_data["estimated"] = False

        properties = eligible_assets.merge(
            epc_data, how="left", on="asset_list_row_id"
        )
        # Rows that found no EPC in the left join have no "estimated" value: treat them as estimated
        properties["estimated"] = properties["estimated"].fillna(True)
        if properties.shape[0] != eligible_assets.shape[0]:
            raise Exception("What the fuck bruv")

        # Take just remaining
        survey_list = loader.data[ha_name]["survey_list"]
        if not survey_list.empty:
            survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
            properties = properties.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            properties = properties[pd.isnull(properties["installation_status"])]
            properties = properties.drop(columns=["installation_status"])

        insulation_rows = []
        for _, row in properties.iterrows():
            roof_description = row["roof-description"]
            if pd.isnull(roof_description) or roof_description == "SAP05:Roof":
                continue

            thickness = RoofAttributes(roof_description).process()["insulation_thickness"]
            # If there is a + in the thickness, strip it out
            thickness = str(thickness).replace("+", "")
            insulation_rows.append(
                {'uprn': row["uprn"], "roof_insulation_thickness": thickness}
            )
        # Pin the columns so the merge below still has its "uprn" key when no row had a usable roof
        # description (a column-less empty frame would raise KeyError)
        insulation_thicknesses = pd.DataFrame(insulation_rows, columns=["uprn", "roof_insulation_thickness"])

        before_merge_shape = properties.shape[0]
        properties = properties.merge(insulation_thicknesses, how="left", on="uprn")

        if properties.shape[0] != before_merge_shape:
            raise Exception("SOMETHING WENT WRONG")

        # Automated archetype check
        if any(properties["ECO Eligibility"].str.contains("subject to archetype")):
            # We perform the archetype test. If the property is a house, it needs to be detached,
            # semi-detached or end terrace. If it's a bungalow, it must be detached
            properties["passes_archetype"] = None
            properties["passes_archetype"] = np.where(
                (properties["property-type"] == "House") &
                (properties["built-form"].isin(["Semi-Detached", "End-Terrace", "Detached"])),
                True,
                properties["passes_archetype"]
            )

            properties["passes_archetype"] = np.where(
                (properties["property-type"] == "Bungalow") &
                (properties["built-form"].isin(["Detached"])),
                True,
                properties["passes_archetype"]
            )

            properties["ECO Eligibility"] = np.where(
                (properties["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)") &
                (properties["passes_archetype"] == True),
                "eco4 (subject to ciga)",
                properties["ECO Eligibility"]
            )

            # If failed the archetype check and needs a CIGA, it's not eligible
            properties["ECO Eligibility"] = np.where(
                (properties["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)") &
                (properties["passes_archetype"] != True),
                "not eligible",
                properties["ECO Eligibility"]
            )

            properties["ECO Eligibility"] = np.where(
                (properties["ECO Eligibility"] == "eco4 (subject to archetype)") &
                (properties["passes_archetype"] == True),
                "eco4",
                properties["ECO Eligibility"]
            )

            # Failing the archetype check without needing a CIGA downgrades the property to GBIS
            properties["ECO Eligibility"] = np.where(
                (properties["ECO Eligibility"] == "eco4 (subject to archetype)") &
                (properties["passes_archetype"] != True),
                "gbis",
                properties["ECO Eligibility"]
            )

        # Every "subject to archetype" eligibility must have been resolved by now
        if any(properties["ECO Eligibility"].str.contains("subject to archetype")):
            raise Exception("DO THE DAMN ARCHETYPE CHECK BRO")

        # Clean roof insulation: map the textual thicknesses onto numbers. The order matters
        # ("below average" must be handled before "average", "above 150" after it).
        properties["roof_insulation_thickness"] = properties["roof_insulation_thickness"].fillna("0")
        for raw_value, numeric_value in (
            ("below average", "50"),
            ("None", "0"),
            ("none", "0"),
            ("average", "150"),
            ("above 150", "150"),
        ):
            properties["roof_insulation_thickness"] = properties[
                "roof_insulation_thickness"
            ].str.replace(raw_value, numeric_value)

        properties["roof_classiciation"] = properties.apply(classify_loft, axis=1)

        had_survey = properties[properties["estimated"] == False]

        # Proportion with a survey
        # NOTE(review): raises ZeroDivisionError when no property remains for this HA
        proportion_with_survey = 100 * had_survey.shape[0] / properties.shape[0]

        # Let's look just at the ECO4 business
        # For things that had a survey, take the properties that didn't need a CIGA check
        no_ciga_check_needed = had_survey[
            had_survey["ECO Eligibility"] == "eco4"
        ]

        no_ciga_check_needed_eligible = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (no_ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) &
            (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80)
        ]

        # For anything not needing a CIGA check, some of it will be GBIS
        no_ciga_check_needed_eligible_gbis = no_ciga_check_needed[
            (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) &
            (~no_ciga_check_needed["asset_list_row_id"].isin(
                no_ciga_check_needed_eligible["asset_list_row_id"].values
            ))
        ]

        # Characterise properties that already passed their CIGA check
        # !!!!!!!!!!!! AT RISK !!!!!!!!!!!!
        ciga_check_passed = had_survey[had_survey["ECO Eligibility"] == "eco4 - passed ciga"]
        ciga_check_passed_eligible = ciga_check_passed[
            (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) &
            (ciga_check_passed["roof_classiciation"].isin(["high", "medium"])) &
            (ciga_check_passed["current-energy-efficiency"].astype(float) <= 80)
        ]

        ciga_list = loader.data[ha_name]["ciga_list"]
        if not ciga_list.empty:
            proportions = ciga_list["Guarantee"].value_counts(normalize=True)
            # The pass rate is the share of "No" entries; it is 0 when the list has no "No" at all
            ha_ciga_pass_rate = proportions.get("No", 0.0)
        else:
            ha_ciga_pass_rate = assumed_ciga_pass_rate

        # We take just the cavity walls
        # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/
        # This paper is based on London properties
        # The proportion of EPCs with building characteristics errors are shown to
        # differ between variables; floor and wall type errors occur in ~10-15% of EPCs,
        # compared with ~5% for wall insulation and glazing performance

        # These should be treated the same as ones that have passed their ciga checks, from a detection
        # perspective
        ciga_check_needed = had_survey[
            had_survey["ECO Eligibility"].str.contains("subject to ciga")
        ].copy()

        ciga_check_needed_eligible = ciga_check_needed[
            (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) &
            (ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) &
            (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80)
        ]

        # Finally, characterise gbis properties. Some of the business might look like ECO4 work, whereas we then
        # qualify what actually looks like gbis
        gbis_identified = had_survey[
            had_survey["ECO Eligibility"] == "gbis"
        ].copy()

        gbis_looks_like_eco4 = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (gbis_identified["roof_classiciation"].isin(["high", "medium"])) &
            (gbis_identified["current-energy-efficiency"].astype(float) <= 80) &
            (
                (
                    (gbis_identified["property-type"] == "House") &
                    (gbis_identified["built-form"] != "Mid-Terrace")
                ) | (
                    (gbis_identified["property-type"] == "Bungalow") &
                    (gbis_identified["built-form"].isin(["Detached"]))
                )
            )
        ]

        gbis_qualified_rows = gbis_identified[
            (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) &
            (gbis_identified["current-energy-efficiency"].astype(float) <= 80) &
            (~gbis_identified["asset_list_row_id"].isin(gbis_looks_like_eco4["asset_list_row_id"].values))
        ]

        ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate)
        without_ciga_expectation = no_ciga_check_needed_eligible.shape[0]
        passed_ciga_expectation = ciga_check_passed_eligible.shape[0]
        identified_as_gbis_looks_like_eco4 = gbis_looks_like_eco4.shape[0]

        # Need to add on the non-ciga
        total_eco4_expectation = (
            ciga_check_expectation +
            without_ciga_expectation +
            passed_ciga_expectation +
            identified_as_gbis_looks_like_eco4
        )

        # This is the work that is at risk
        eco4_work_at_risk = (
            passed_ciga_expectation +
            ciga_check_expectation
        )

        no_ciga_check_needed_actually_gbis = no_ciga_check_needed_eligible_gbis.shape[0]
        gbis_qualified = gbis_qualified_rows.shape[0]

        total_gbis_expectation = no_ciga_check_needed_actually_gbis + gbis_qualified

        if proportion_with_survey < 100:
            # We estimate the rest by applying the conversion rates observed on the properties with a survey.
            # NOTE(review): the rates below divide by the size of the surveyed sample and raise
            # ZeroDivisionError when that sample is empty while un-surveyed properties exist.
            without_survey_needing_ciga = properties[
                (properties["estimated"] == True) &
                (properties["ECO Eligibility"].str.contains("subject to ciga") == True)
            ]

            if without_survey_needing_ciga.empty:
                without_survey_without_ciga_expected = 0
            elif ciga_check_needed.shape[0] == 0 and ciga_check_expectation == 0:
                # No surveyed sample to derive a rate from: every un-surveyed property is counted
                without_survey_without_ciga_expected = without_survey_needing_ciga.shape[0]
            else:
                without_survey_without_ciga_expected = np.round(
                    without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0])
                )

            without_survey_passed_ciga = properties[
                (properties["estimated"] == True) &
                (properties["ECO Eligibility"] == "eco4 - passed ciga")
            ]

            if without_survey_passed_ciga.empty:
                without_survey_passed_ciga_expected = 0
            else:
                without_survey_passed_ciga_expected = np.round(
                    without_survey_passed_ciga.shape[0] * (passed_ciga_expectation / ciga_check_passed.shape[0])
                )

            # Finally, no ciga needed
            without_survey_eco4 = properties[
                (properties["estimated"] == True) &
                (properties["ECO Eligibility"] == "eco4")
            ]

            if without_survey_eco4.empty:
                without_survey_eco4_expected = 0
                without_survey_gbis_expected = 0
            else:
                without_survey_eco4_expected = np.round(
                    without_survey_eco4.shape[0] * (without_ciga_expectation / no_ciga_check_needed.shape[0])
                )

                # NOTE(review): the numerator is the surveyed GBIS total (which includes properties
                # identified as GBIS), not only the "eco4" properties that turned out to be GBIS -
                # confirm this rate is the intended one.
                without_survey_gbis_expected = np.round(
                    without_survey_eco4.shape[0] * (total_gbis_expectation / no_ciga_check_needed.shape[0])
                )

            # And gbis
            without_survey_gbis = properties[
                (properties["estimated"] == True) &
                (properties["ECO Eligibility"] == "gbis")
            ]

            if without_survey_gbis.empty:
                without_survey_identified_as_gbis_qualified = 0
                without_survey_identified_as_gbis_eco4 = 0
            else:
                without_survey_identified_as_gbis_qualified = np.round(
                    without_survey_gbis.shape[0] * (gbis_qualified / gbis_identified.shape[0])
                )

                # The "looks like ECO4" rate was measured on surveyed GBIS properties, so it is applied
                # to the un-surveyed GBIS properties
                without_survey_identified_as_gbis_eco4 = np.round(
                    without_survey_gbis.shape[0] * (identified_as_gbis_looks_like_eco4 / gbis_identified.shape[0])
                )

            total_eco4_expectation = (
                total_eco4_expectation +
                without_survey_without_ciga_expected +
                without_survey_passed_ciga_expected +
                without_survey_eco4_expected +
                without_survey_identified_as_gbis_eco4
            )

            total_gbis_expectation = (
                total_gbis_expectation +
                without_survey_gbis_expected +
                without_survey_identified_as_gbis_qualified
            )

        results.append(
            {
                "HA Name": ha_name,
                "Original ECO4 Estimate - Remaining": original_remaining,
                "Original GGBIS Estimate - Remaining": original_gbis_remaining,
                # "Postcode List - Remaining": postcode_list_remaining,
                # "Of which sold": sales_since_nov,
                "EPC verified ECO4 Eligible - Remaining": int(total_eco4_expectation),
                "EPC verified GBIS Eligibile - Remaining": int(total_gbis_expectation),
                # At risk work
                "Work at risk due to audits": eco4_work_at_risk
            }
        )

    results_df = pd.DataFrame(results)
    results_df.to_csv("analysis - revised - audit update.csv")
|
|
|
|
# results_df["Delta vs November"] = 100 * (
|
|
# results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"]
|
|
# ) / results_df["Original ECO4 Estimate - Remaining"]
|
|
|
|
# TODO: Add in estimated GBIS (for eco jobs, of which look like gbis)
|
|
# TODO: Change the left hand side number for our post CIGA estimates
|
|
|
|
|
|
def _select_remaining_volumes(frame, ha_column, eco4_column, gbis_column, label, extra_columns=None):
    """
    Pull one set of "remaining" volume columns out of a raw results table and give them report-friendly names.

    :param frame: DataFrame holding the raw columns
    :param ha_column: name of the raw column holding the HA name
    :param eco4_column: name of the raw column holding the remaining ECO4 volume
    :param gbis_column: name of the raw column holding the remaining GBIS volume
    :param label: suffix identifying the estimate, e.g. "All HA Summary"
    :param extra_columns: optional mapping of further raw column names to their output names; these are placed
        after the GBIS column and before the total
    :return: DataFrame sorted by "HA Name" containing "# ECO4 remaining - <label>", "# GBIS remaining - <label>",
        any extra columns and "# Total remaining - <label>"
    """
    extra_columns = extra_columns or {}
    eco4_name = f"# ECO4 remaining - {label}"
    gbis_name = f"# GBIS remaining - {label}"

    selected = frame[[ha_column, eco4_column, gbis_column, *extra_columns]].copy().rename(
        columns={
            ha_column: "HA Name",
            eco4_column: eco4_name,
            gbis_column: gbis_name,
            **extra_columns,
        }
    )
    selected[f"# Total remaining - {label}"] = selected[eco4_name] + selected[gbis_name]
    return selected.sort_values("HA Name")


def create_final_report(*, eco4_job_value=1710, gbis_job_value=600):
    """
    This function will produce the final output for the HA analysis.

    It reads "analysis - revised - audit update.csv" (the EPC validated results) and
    "pipeline_remaining_raw.csv" (the pipeline results) from the working directory, lines up four estimates of the
    remaining ECO4/GBIS volumes per HA, converts the volumes to revenue and writes both tables to
    "HA Analysis - Audit Update - volumes.csv" and "HA Analysis - Audit Update - revenue.csv".

    :param eco4_job_value: revenue (£) attributed to a single ECO4 job
    :param gbis_job_value: revenue (£) attributed to a single GBIS job
    :raises ValueError: if one of the sanity checks on the volumes fails
    :return:
    """
    epc_validated_results = pd.read_csv("analysis - revised - audit update.csv")
    pipeline_results = pd.read_csv("pipeline_remaining_raw.csv")

    # The pipeline csv was written from a frame with multi-level columns, so each header is a stringified tuple
    ha_name_column = "('', '', '', 'HA Name')"
    gbis_postcode_list_column = (
        "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')"
    )

    ####################################
    # Original Warmfront estimates
    ####################################
    all_ha_summary_remaining = _select_remaining_volumes(
        pipeline_results,
        ha_column=ha_name_column,
        eco4_column="('ECO4 original', '', 'Remaining - #', '')",
        gbis_column="('GBIS original', '', 'Remaining - #', '')",
        label="All HA Summary",
    )

    ####################################
    # Postcode list - pre-CIGA
    ####################################
    postcode_list_pre_ciga_remaining = _select_remaining_volumes(
        pipeline_results,
        ha_column=ha_name_column,
        eco4_column="('ECO4 pre-ciga', '', 'Remaining - #', '')",
        gbis_column=gbis_postcode_list_column,
        label="Postcode list (pre CIGA)",
    )

    ####################################
    # Postcode list - post-CIGA
    ####################################
    # The GBIS figure is taken from the same postcode list column as the pre-CIGA estimate
    postcode_list_post_ciga_remaining = _select_remaining_volumes(
        pipeline_results,
        ha_column=ha_name_column,
        eco4_column="('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')",
        gbis_column=gbis_postcode_list_column,
        label="Postcode list (post CIGA)",
    )

    ####################################
    # From EPC Database
    ####################################
    from_epc_database = _select_remaining_volumes(
        epc_validated_results,
        ha_column="HA Name",
        eco4_column="EPC verified ECO4 Eligible - Remaining",
        gbis_column="EPC verified GBIS Eligibile - Remaining",
        label="From EPC Database (post CIGA)",
        extra_columns={"Work at risk due to audits": "ECO4 remaining work at risk due to Audits"},
    )

    # Combine the datasets. The final inner merge keeps only the HAs that have EPC validated results
    volumes = all_ha_summary_remaining.merge(
        postcode_list_pre_ciga_remaining, how="left", on="HA Name"
    ).merge(
        postcode_list_post_ciga_remaining, how="left", on="HA Name"
    ).merge(
        from_epc_database, how="inner", on="HA Name"
    )

    # Convert the volumes to revenue and re-calculate the totals from the converted columns
    revenue = volumes.copy()
    for label in [
        "All HA Summary",
        "Postcode list (pre CIGA)",
        "Postcode list (post CIGA)",
        "From EPC Database (post CIGA)",
    ]:
        eco4_name = f"# ECO4 remaining - {label}"
        gbis_name = f"# GBIS remaining - {label}"
        revenue[eco4_name] = revenue[eco4_name] * eco4_job_value
        revenue[gbis_name] = revenue[gbis_name] * gbis_job_value
        revenue[f"# Total remaining - {label}"] = revenue[eco4_name] + revenue[gbis_name]

    # The work at risk is ECO4 work, so it is valued at the ECO4 rate
    revenue["ECO4 remaining work at risk due to Audits"] = (
        revenue["ECO4 remaining work at risk due to Audits"] * eco4_job_value
    )

    # Replace the # with £ in the columns
    revenue.columns = [col.replace("#", "£") for col in revenue.columns]

    # We check that each column gets smaller
    decreasing_check1 = (
        volumes["# ECO4 remaining - Postcode list (pre CIGA)"] >=
        volumes["# ECO4 remaining - Postcode list (post CIGA)"]
    ).all()
    if not decreasing_check1:
        raise ValueError("decreasing_check1 failed")

    # Just HA32 and HA17 should fail this, and it's due to GBIS jobs looking like ECO4
    decreasing_check2 = volumes[
        volumes["# ECO4 remaining - From EPC Database (post CIGA)"] >
        volumes["# ECO4 remaining - Postcode list (post CIGA)"]
    ]
    if set(decreasing_check2["HA Name"].tolist()) != {"HA17", "HA32"}:
        raise ValueError("decreasing_check2 failed")

    # Check for GBIS
    decreasing_check3 = (
        volumes["# GBIS remaining - Postcode list (pre CIGA)"] >=
        volumes["# GBIS remaining - Postcode list (post CIGA)"]
    ).all()
    if not decreasing_check3:
        raise ValueError("decreasing_check3 failed")

    # Don't perform this - this happens for multiple
    # decreasing_check4 = volumes[volumes["# GBIS remaining - From EPC Database (post CIGA)"] > volumes[
    #     "# GBIS remaining - Postcode list (post CIGA)"]]

    # Store final outputs
    volumes.to_csv("HA Analysis - Audit Update - volumes.csv")
    revenue.to_csv("HA Analysis - Audit Update - revenue.csv")
|
|
|
|
|
|
def identify_eco_works(loader):
    """
    For a fixed set of HAs, splits the remaining (not yet installed) assets by ECO4 status and stores each
    non-empty split as a csv in local_data/.

    For each HA this writes up to three files, named after the HA:
    "<name> - needs ciga.csv", "<name> - eco4.csv" and "<name> - eco4 passed ciga.csv".

    :param loader: object exposing a `data` dict, keyed by HA name, whose values hold an "asset_list" and a
        "survey_list" DataFrame
    :return: DataFrame with one row per processed HA giving the number of properties in each split
    """
    # Previous batch, kept for reference:
    # "HA16" (For Housing), "HA39" (Rooftop), "HA41" (Settle), "HA23" (Lambeth), "HA14" (EMH),
    # "HA7" (Believe), "HA102" (Thrive)

    # Unitas, fairhive, acis, LHP - maps the HA key to the name used in the output filenames
    names = {
        "HA50": "Unitas",
        "HA15": "Fairhive",
        "HA107": "ACIS",
        "HA24": "LHP",
    }

    breakdowns = []
    for ha, data_assets in loader.data.items():
        if ha not in names:
            continue

        asset_list = data_assets["asset_list"].copy()
        survey_list = data_assets["survey_list"].copy()

        # Remove things that have sold
        if not survey_list.empty:
            # Surveys that were never matched to an asset carry no key to merge on, so they are dropped first
            survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
            asset_list = asset_list.merge(
                survey_list[["asset_list_row_id", "installation_status"]],
                how="left",
                on="asset_list_row_id"
            )
            # Anything that has an installation has gone to installation, and therefore is not remaining
            asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
            asset_list = asset_list.drop(columns=["installation_status"])

        # Split the remaining assets on their eligibility status
        needs_cga = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"].copy()
        eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"].copy()
        eco4_passed_ciga = asset_list[asset_list["ECO Eligibility"] == "eco4 - passed ciga"].copy()

        # Store the data - empty splits are not written
        if not needs_cga.empty:
            needs_cga.to_csv(f"local_data/{names[ha]} - needs ciga.csv")

        if not eco4.empty:
            eco4.to_csv(f"local_data/{names[ha]} - eco4.csv")

        if not eco4_passed_ciga.empty:
            eco4_passed_ciga.to_csv(f"local_data/{names[ha]} - eco4 passed ciga.csv")

        breakdowns.append(
            {
                "HA Name": ha,
                "n_needing_ciga": needs_cga.shape[0],
                "eco4": eco4.shape[0],
                "eco4_passed_ciga": eco4_passed_ciga.shape[0],
            }
        )

    breakdowns = pd.DataFrame(breakdowns)
    breakdowns = breakdowns.fillna(0)

    # Previously the summary was built and then discarded; returning it does not affect callers that ignore it
    return breakdowns
|
|
|
|
|
|
def _unitas_worksheet_to_dataframe(worksheet, header_row=1):
    """
    Converts an openpyxl worksheet to a DataFrame.

    :param worksheet: the openpyxl worksheet to read
    :param header_row: 1-indexed row holding the column names; every row below it is treated as data
    :return: DataFrame with one row per worksheet row below the header
    """
    column_names = [cell.value for cell in worksheet[header_row]]
    rows_data = [
        [cell.value for cell in row]
        for row in worksheet.iter_rows(min_row=header_row + 1, values_only=False)
    ]
    return pd.DataFrame(rows_data, columns=column_names)


def unitas_data_prep(loader):
    """
    Adhoc - for UNITAS (HA50), builds the list of properties still to be surveyed and stores it as
    "Unitas - phase 2 properties to Survey.xlsx".

    The steps are:
        1) Remove the assets that already have a matched survey, and those surveyed according to the newer
           master rolling sheet (phase 1 extras + phase 2)
        2) Add the properties that came back from the second round of CIGA checks with no guarantee
        3) Attach the newest EPC found for each property
        4) Attach the outcome(s) of previous survey visits

    The EPC credentials are read from the EPC_AUTH_TOKEN environment variable.

    :param loader: object exposing a `data` dict holding the "asset_list" and "survey_list" DataFrames under "HA50"
    :raises RuntimeError: if EPC_AUTH_TOKEN is not set
    :raises Exception: if a further completed survey cannot be matched to exactly one asset, if an outcome matches
        more than one property, or if a property has more than two recorded outcomes
    :return:
    """
    # Credentials must not live in source control, so the key is read from the environment. We fail fast here
    # rather than part way through the EPC retrieval
    epc_api_key = os.getenv("EPC_AUTH_TOKEN")
    if not epc_api_key:
        raise RuntimeError("EPC_AUTH_TOKEN must be set in the environment to retrieve EPC data")

    #####
    # Adhoc - for UNITAS, stripping out additional surveys that have been completed
    unitas_data = loader.data["HA50"].copy()
    unitas_asset_list = unitas_data["asset_list"].copy()
    unitas_survey_sheet = unitas_data["survey_list"].copy()

    # We remove the surveyed properties from the asset sheet
    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
    unitas_asset_list = unitas_asset_list.merge(
        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
        how="left",
        on="asset_list_row_id"
    )
    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])

    # We read in the data for the further completed surveys
    unitas_phase_1_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
    )
    phase_1_surveys = _unitas_worksheet_to_dataframe(unitas_phase_1_workbook["ECO 4 - PHASE 1"])
    phase_2_worksheet = unitas_phase_1_workbook["ECO4 - PHASE 2"]

    # Correct phase 1 surveys in the same fashion as the previous approach
    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())

    # We check all phase 1 surveys are contained in the data we had before. Anything that cannot be matched to
    # exactly one existing survey (on postcode + number, then street + number) is treated as additional
    additional = []
    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
        matched_1 = unitas_survey_sheet[
            (unitas_survey_sheet["Post Code"] == row["Post Code"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_1.shape[0] == 1:
            continue

        matched_2 = unitas_survey_sheet[
            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_2.shape[0] == 1:
            continue

        additional.append(row.to_dict())
    additional = pd.DataFrame(additional)

    phase_2_surveys = _unitas_worksheet_to_dataframe(phase_2_worksheet)
    # Drop all of the occurrences of "OFFICE USE ONLY" columns
    phase_2_surveys = phase_2_surveys.drop(columns=[c for c in phase_2_surveys.columns if "OFFICE USE ONLY" in c])
    common_columns = list({c for c in phase_2_surveys.columns if c in additional.columns})
    additional_filtered = additional[common_columns]

    further_unitas_completed_surveys = pd.concat(
        [phase_2_surveys, additional_filtered],
        axis=0,
        ignore_index=True
    )

    # Add a phase 2 key. NOTE: the hard-coded ids below are built from this index, so they depend on the row
    # order of the concat above
    further_unitas_completed_surveys["survey_list_row_id"] = [
        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
    ]

    # Confirmed not in asset list
    not_in_asset_list = [
        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
    ]
    additional_postcodes = ["st28bg"]

    full_asset_list = unitas_data["asset_list"].copy()
    full_asset_list["matching_postcode"] = full_asset_list["matching_postcode"].str.lower().str.replace(" ", "")
    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
        "ST 5DT", "ST3 5DT"
    )

    # We match these back to the asset list
    matching_lookup = []
    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):

        if row["survey_list_row_id"] in not_in_asset_list:
            continue

        postcode_lower = row["Post Code"].lower().strip().replace(" ", "")
        if postcode_lower in additional_postcodes:
            continue

        # Filter asset list on postcode, then on house number
        df = full_asset_list[
            full_asset_list["matching_postcode"].str.contains(postcode_lower)
        ]
        df = df[df["HouseNo"] == str(row["NO."])]

        if df.shape[0] != 1:
            raise Exception("NOT FOUND")

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    matching_lookup["phase_2_surveyed"] = True

    # We merge this onto the asset list and remove the rows
    unitas_asset_list = unitas_asset_list.merge(
        matching_lookup, how="left", on="asset_list_row_id"
    )
    # Drop rows where phase_2_surveyed is populated
    unitas_asset_list = unitas_asset_list[
        pd.isnull(unitas_asset_list["phase_2_surveyed"])
    ]

    # We add in the new CIGA submissions
    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
    ciga_round_2 = _unitas_worksheet_to_dataframe(unitas_round_2_ciga_workbook["Worksheet"])

    ciga_dependent_asset_list = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga")
    ].copy()

    # We merge the ciga sheet to the asset list
    ciga_round_2_matched = ciga_dependent_asset_list.merge(
        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
    )
    # Filter on just the properties that had no guarantee
    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]

    # ECO Eligibility
    # not eligible              9227
    # failed ciga               2711
    # eco4 (subject to ciga)    2238
    # eco4 - passed ciga         901
    # gbis                       114
    # eco4                        91

    # We filter on the properties we're looking to re-survey
    unitas_properties_to_survey = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].isin(
            [
                "eco4 - passed ciga",
                "eco4"
            ]
        )
    ].copy()

    unitas_properties_to_survey = pd.concat(
        [
            unitas_properties_to_survey,
            ciga_round_2_matched[unitas_properties_to_survey.columns]
        ]
    )

    # We now retrieve the latest EPC data
    epc_data = []
    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
        property_type, _built_form = get_property_type_and_built_form(
            property_meta=unitas_property, ha_name="HA50"
        )

        full_address = unitas_property["matching_address"]

        searcher = SearchEpc(
            address1=str(unitas_property["HouseNo"]),
            postcode=unitas_property["matching_postcode"],
            auth_token=epc_api_key,
            os_api_key="",
            property_type=property_type,
            full_address=full_address,
            fast=True
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None

        searcher.find_property(skip_os=True)
        if searcher.newest_epc is None:
            continue

        epc = {
            "asset_list_row_id": unitas_property["asset_list_row_id"],
            **searcher.newest_epc.copy()
        }

        epc_data.append(epc)

    epc_df = pd.DataFrame(epc_data)
    # Pull out just the columns we need. The copy means the new column below is added to an independent frame
    # rather than to a slice of the full EPC data
    epc_df = epc_df[
        [
            "asset_list_row_id",
            "address1", "postcode",
            "current-energy-efficiency",
            "current-energy-rating",
            "inspection-date",
            "transaction-type",
            "built-form"
        ]
    ].copy()

    epc_df["EPC Rating"] = (
        epc_df["current-energy-efficiency"].astype(str) +
        epc_df["current-energy-rating"].astype(str)
    )

    # Merge onto the Unitas data:
    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
        epc_df[
            [
                "asset_list_row_id",
                "EPC Rating",
                "inspection-date",
                "transaction-type",
                "built-form"
            ]
        ],
        how="left",
        on="asset_list_row_id"
    )

    # The properties added from the second round of CIGA checks are the only ones still flagged as subject to ciga
    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
    )

    # Properties without an EPC get an explicit placeholder rather than an empty cell
    for col in ["EPC Rating", "inspection-date", "transaction-type", "built-form"]:
        unitas_properties_to_survey_full[col] = np.where(
            pd.isnull(unitas_properties_to_survey_full[col]),
            "No EPC found",
            unitas_properties_to_survey_full[col]
        )
        unitas_properties_to_survey_full[col] = unitas_properties_to_survey_full[col].astype(str)

    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
        columns={
            "inspection-date": "Last EPC Inspection Date",
            "transaction-type": "Last EPC Reason",
            "built-form": "Last EPC Built Form",
        }
    )

    # We now match to the survey outcomes. The column names of this sheet are on the second row
    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
    )
    unitas_outcomes = _unitas_worksheet_to_dataframe(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
    unitas_outcomes = unitas_outcomes.rename(
        columns={
            "Notes (If 'no answer' under outcomes, have you checked around the property for access "
            "issues where possible?)": "Notes"
        }
    )

    # Merge outcomes onto properties to survey. The normalised postcodes are computed from the frame that is
    # actually filtered below; previously the helper column was only created on the full asset list copy, which
    # is a different DataFrame. Keeping it as a separate Series means no extra column ends up in the output
    survey_postcodes_nospace = (
        unitas_properties_to_survey_full["matching_postcode"].str.lower().str.replace(" ", "")
    )
    outcome_matching = []
    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
        # We search for the corresponding entry in the properties to survey
        postcode_lower = outcome["Postcode"].lower().strip().replace(" ", "")

        # Filter on postcode, then on house number
        df = unitas_properties_to_survey_full[
            survey_postcodes_nospace.str.contains(postcode_lower)
        ]
        df = df[df["HouseNo"] == str(outcome["No."])]

        # Outcomes for properties that are not in the list to survey are ignored
        if df.empty:
            continue

        if df.shape[0] == 1:
            outcome_matching.append(
                {
                    "asset_list_row_id": df["asset_list_row_id"].values[0],
                    **outcome.to_dict()
                }
            )
            continue

        raise Exception("something went wrong")
    outcome_matching = pd.DataFrame(outcome_matching)

    # We can have duplicate matches, so we format the Date letter sent column and retrieve the newest outcome
    outcome_matching["Date letters sent"] = outcome_matching["Date letters sent"].str.lower()
    outcome_matching["Extracted Date"] = outcome_matching["Date letters sent"].str.extract(
        r'(?:w[./]c )(\d{2}\.\d{2}\.\d{4})')
    outcome_matching["Extracted Date"] = pd.to_datetime(outcome_matching["Extracted Date"], format='%d.%m.%Y')
    # We sort by asset_list_row_id and extracted date, so the newest outcome is the first row of each group
    outcome_matching = outcome_matching.sort_values(["asset_list_row_id", "Extracted Date"], ascending=[True, False])

    # Some properties will have multiple outcomes - for these, we re-format so there is one row per property
    outcome_matching_grouped = []
    for asset_list_row_id, grouped_data in outcome_matching.groupby("asset_list_row_id"):
        if grouped_data.shape[0] == 1:
            outcome_matching_grouped.append(
                {
                    "Number of previous visits": 1,
                    **grouped_data.to_dict("records")[0]
                }
            )
            continue
        if grouped_data.shape[0] == 2:
            newest_visit = grouped_data.head(1)
            # The older of the two outcomes is stored alongside the newest, in " second visit" columns
            oldest_visit = grouped_data.tail(1)[['Outcomes', 'Surveyor', 'Notes', 'Date letters sent']].add_suffix(
                " second visit")
            to_append = {
                "Number of previous visits": 2,
                **newest_visit.to_dict("records")[0],
                **oldest_visit.to_dict("records")[0]
            }
            outcome_matching_grouped.append(to_append)
        else:
            raise Exception("something went wrong")

    outcome_matching_grouped = pd.DataFrame(outcome_matching_grouped)

    unitas_properties_to_survey_with_outcomes = unitas_properties_to_survey_full.merge(
        outcome_matching_grouped, how="left", on="asset_list_row_id"
    )
    unitas_properties_to_survey_with_outcomes["Number of previous visits"] = (
        unitas_properties_to_survey_with_outcomes["Number of previous visits"].fillna(0)
    )

    # Store as an excel
    unitas_properties_to_survey_with_outcomes.to_excel("Unitas - phase 2 properties to Survey.xlsx")
|
|
|
|
|
|
def app():
    """
    This app runs the housing association analysis for the priority HAs listed below.

    It collects the .xlsx inputs found in the sub-folders of DATA_FOLDER, keeps those that sit in a priority HA's
    folder, loads them through the DataLoader and then runs the forecasting, the EPC validated analysis and the
    final report.
    :return:
    """

    # Determines if we want to use the cached data in s3
    use_cache = True
    # Determines if we want to perform the data pull
    pull_data = False
    # Override to re-build all inputs
    rebuild_inputs = False

    # List all of the data in the folder
    directories = [str(file) for entry in DATA_FOLDER.iterdir() if entry.is_dir()
                   for file in entry.iterdir() if file.suffix == '.xlsx']
    # Grab the December HA figures filepath
    december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"

    # Add in:
    priority_has = [
        "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24",
        "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54",
        "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42",
        # Added as of March 18th
        "HA44", "HA45", "HA51", "HA52", "HA17", "HA5", "HA20",
        # New HAS
        "HAXX", "HAXXX",
    ]
    # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
    # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
    # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE]
    #
    # Consider for ECO4:
    # HA 70 - have to merge ECO3 list though,
    # HA17 has LOTs of assets, but the asset list is a mess
    # HA53 but has EPCs done
    # Consider for GBIS:
    # Ignore for now:
    # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in

    # Filter down the directories to only the priority HAs. Each file sits directly inside its HA's folder, so the
    # HA is the name of the parent folder. This holds wherever DATA_FOLDER lives, unlike picking a fixed position
    # out of the "/"-separated path
    directories = [d for d in directories if Path(d).parent.name in priority_has]

    loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs)
    loader.load()
    loader.ha_facts_and_figures()

    # import pickle
    # with open("ha_analysis_data_temp.pkl", "wb") as f:
    #     pickle.dump(loader, f)
    # import pickle
    # with open("ha_analysis_data_temp.pkl", "rb") as f:
    #     loader = pickle.load(f)

    forecast_remaining_sales(loader)

    # Functions to produce the final output
    # fml_data_pull(loader)  # If we need to pull EPC data
    fml_analysis(loader)
    create_final_report()
|
|
|
|
# Adhoc - for HA16, get the properties that still need a CIGA check
|
|
# asset_list_ha16 = loader.data["HA16"]["asset_list"].copy()
|
|
# ha_16_need_ciga = asset_list_ha16[
|
|
# asset_list_ha16["ECO Eligibility"].str.contains("subject to ciga")
|
|
# ]
|
|
# completed_cigas = loader.data["HA16"]["ciga_list"].copy()
|
|
# # Store the results
|
|
# ha_16_need_ciga.to_csv("ha16_need_ciga.csv")
|
|
# completed_cigas.to_csv("ha16_completed_cigas.csv")
|
|
#
|
|
# # Adhoc - look at the current pipeline and identify how many dormant, CIGA dependent properties there are for
|
|
# # live projects
|
|
#
|
|
# # Read excel
|
|
# orderbook_filepath = "local_data/ha_data/Warmfront HA client order book overview_20240129.xlsx"
|
|
# orderbook_workbook = openpyxl.load_workbook(orderbook_filepath)
|
|
# orderbook_sheet = orderbook_workbook["Contractual Info"]
|
|
# orderbook_colnames = [cell.value for cell in orderbook_sheet[1]]
|
|
#
|
|
# rows = []
|
|
# for row in orderbook_sheet.iter_rows(min_row=2, values_only=False):
|
|
# row_data = [cell.value for cell in row] # This will get you the cell values
|
|
# rows.append(row_data)
|
|
#
|
|
# orderbook = pd.DataFrame(rows, columns=orderbook_colnames)
|
|
# live_orderbook = orderbook[orderbook["Live, New, or Historic?"] == "LIVE"].copy()
|
|
# live_orderbook['Redacted HA'] = live_orderbook['Redacted HA'].str.replace(" ", "")
|
|
#
|
|
# dormant_properties = []
|
|
# missed_has = []
|
|
# for _, customer in live_orderbook.iterrows():
|
|
# if customer['Redacted HA'] not in loader.data.keys():
|
|
# missed_has.append(customer['Redacted HA'])
|
|
# continue
|
|
# asset_list = loader.data[customer['Redacted HA']]["asset_list"].copy()
|
|
# survey_list = loader.data[customer['Redacted HA']]["survey_list"].copy()
|
|
# # Remove sold
|
|
# if not survey_list.empty:
|
|
# survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])]
|
|
# asset_list = asset_list.merge(
|
|
# survey_list[["asset_list_row_id", "installation_status"]],
|
|
# how="left",
|
|
# on="asset_list_row_id"
|
|
# )
|
|
# # Anything that has an installation has gone to installation, and therefore is not remaining
|
|
# asset_list = asset_list[pd.isnull(asset_list["installation_status"])]
|
|
# asset_list = asset_list.drop(columns=["installation_status"])
|
|
#
|
|
# # We pull out the properties that need a CIGA check
|
|
# need_ciga = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"]
|
|
# need_archetype = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to archetype)"]
|
|
# need_ciga_and_archetype = asset_list[
|
|
# asset_list["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)"
|
|
# ]
|
|
#
|
|
# dormant_properties.append(
|
|
# {
|
|
# "HA Name": customer['Redacted HA'],
|
|
# "Need CIGA": need_ciga.shape[0],
|
|
# "Need Archetype": need_archetype.shape[0],
|
|
# "Need CIGA and Archetype": need_ciga_and_archetype.shape[0]
|
|
# }
|
|
# )
|
|
#
|
|
# dormant_properties = pd.DataFrame(dormant_properties)
|
|
# totals = dormant_properties.sum()
|
|
# totals["HA Name"] = "Total"
|
|
#
|
|
# dormant_properties = pd.concat([dormant_properties, totals.to_frame().T])
|
|
# dormant_properties.to_csv("dormant_properties.csv")
|
|
#
|
|
# loader.december_figures["ECO4 remaining"].sum()
|
|
# december_figures = loader.december_figures.copy()
|
|
# december_figures["ECO4 remaining"] = np.where(
|
|
# december_figures["ECO4 remaining"] < 0,
|
|
# 0,
|
|
# december_figures["ECO4 remaining"]
|
|
# )
|
|
# december_figures["ECO4 remaining"].sum()
|