Model/etl/eligibility/ha_15_32/app.py
Khalim Conn-Kowlessar 281c6f626c working on eligibility
2024-02-26 23:23:29 +00:00

1146 lines
46 KiB
Python

"""
This process has been created to compare the model based eligibility process against the in-person process
used by the Warmfront team, to identify which properties are eligible for ECO4 and GBIS funding. This
work is being done in December 2023, prior to completion of acquisition
"""
import pickle
from etl.epc.Record import EPCRecord
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import msgpack
from datetime import datetime, timedelta
from utils.logger import setup_logger
from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from backend.Property import Property
from etl.eligibility.Eligibility import Eligibility
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
# Location of the .env file that is loaded below.
# NOTE(review): Path(__file__).parent is already the directory that contains this module. If this
# module lives at etl/eligibility/ha_15_32/app.py, the path below resolves to
# <module dir>/etl/eligibility/ha_15_32/.env rather than <module dir>/.env - confirm this is intended.
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
# Module-level logger used by the merge and analysis functions in this file
logger = setup_logger()
# Environment variables are loaded from ENV_FILE at import time
load_dotenv(ENV_FILE)
def load_data():
    """
    Read the four CSV inputs used by this comparison: the asset lists for HA32 and HA15 and the
    matching lists of identified addresses. All paths are relative to the current working directory.
    :return: tuple of DataFrames in the order
        (ha32 asset list, ha15 asset list, ha32 identified addresses, ha15 identified addresses)
    """
    # Asset lists first, then the identified addresses - this is the order the frames are returned in
    csv_paths = (
        "etl/eligibility/ha_15_32/HA32 - ASSET LIST.csv",
        "etl/eligibility/ha_15_32/HA15 - ASSET LIST.csv",
        "etl/eligibility/ha_15_32/HA 32 Identified addresses.csv",
        "etl/eligibility/ha_15_32/HA 15 Identified addresses.csv",
    )
    ha32_assets, ha15_assets, ha32_identified, ha15_identified = (
        pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths
    )
    return ha32_assets, ha15_assets, ha32_identified, ha15_identified
def marge_ha_32(asset_list, identified_addresses):
    """
    Combine the HA32 asset list with the HA32 identified addresses into a single frame that has one
    row per asset-list entry and a boolean ``identified`` column.

    Note that both inputs are written to: the address/postcode corrections and the merge-key columns
    are assigned onto the frames that are passed in.
    :param asset_list: DataFrame with (at least) "Dwelling num", "Street" and "Postcode" columns
    :param identified_addresses: DataFrame with (at least) "No.", "Address" and "Postcode" columns
    :return: tuple of (merged frame, list of merge keys of identified rows dropped as duplicates)
    :raises ValueError: if a duplicated asset-list key is also an identified address, if either merge
        changes the number of rows, or if the number of unmatched identified addresses is not 36
    """

    def squash(column):
        # Lower-cased, space-free text so that formatting differences do not prevent a match
        return column.astype(str).str.lower().str.strip().str.replace(" ", "")

    row_count = len(asset_list)
    dropped_identified_merge_keys = []

    # Street names as they appear in the identified addresses -> how they appear in the asset list
    # (shortened names and misspellings). Applied one after the other, exact matches only.
    address_corrections = [
        ("Coxwold", "Coxwold Grove"),
        ("Barringhton Avenue", "Barrington Avenue"),
        ("Rustenburg", "Rustenburg Street"),
        ("MALIN LODGE, RONALDSWAY CLOSE", "Malin Lodge"),
        ("Feroes Close", "Faroes Close"),
    ]
    for listed_as, corrected in address_corrections:
        identified_addresses["Address"] = np.where(
            identified_addresses["Address"] == listed_as,
            corrected,
            identified_addresses["Address"]
        )

    # 7 Norton Grove carries the wrong postcode in the asset list
    is_norton_grove_7 = (
        (asset_list["Street"] == "Norton Grove")
        & (asset_list["Postcode"] == "HU4 6HQ")
        & (asset_list["Dwelling num"] == "7")
    )
    asset_list["Postcode"] = np.where(is_norton_grove_7, "HU4 6HG", asset_list["Postcode"])

    # merge_key is number + street + postcode; merge_key2 is the same without the postcode
    asset_number_street = squash(asset_list["Dwelling num"]) + squash(asset_list["Street"])
    asset_list["merge_key"] = asset_number_street + squash(asset_list["Postcode"])
    asset_list["merge_key2"] = asset_number_street

    identified_number_street = squash(identified_addresses["No."]) + squash(identified_addresses["Address"])
    identified_addresses["merge_key"] = identified_number_street + squash(identified_addresses["Postcode"])
    identified_addresses["merge_key2"] = identified_number_street

    # Repeated identified addresses are dropped (first occurrence kept) and their keys are reported
    repeated_identified = identified_addresses["merge_key"].duplicated()
    n_repeated = repeated_identified.sum()
    if n_repeated:
        logger.warning("We have %s duplicated identified addresses that will be dropped", n_repeated)
        dropped_identified_merge_keys.extend(identified_addresses[repeated_identified]["merge_key"].tolist())
        identified_addresses = identified_addresses.drop_duplicates("merge_key")

    # Repeated asset-list rows are kept, but none of them may be an identified address
    repeated_assets = asset_list["merge_key"].duplicated()
    if repeated_assets.sum():
        logger.warning(
            "We have some duplicated asset list rows - they won't be dropped but we make sure they aren't in the "
            "identified addresses"
        )
        repeated_asset_keys = asset_list[repeated_assets]["merge_key"].tolist()
        clashes = identified_addresses[identified_addresses.merge_key.isin(repeated_asset_keys)]
        if not clashes.empty:
            raise ValueError("We have a problem here, investigate me")

    # First pass: match on the full key (number + street + postcode)
    merged_data = pd.merge(
        asset_list,
        identified_addresses.drop(columns="merge_key2"),
        how="left",
        left_on="merge_key",
        right_on="merge_key",
        suffixes=("", "_identified_addresses")
    )
    if merged_data.shape[0] != row_count:
        raise ValueError("Row numbers have changed")

    # Second pass: match on number + street only
    merged_data = merged_data.merge(
        identified_addresses.drop(columns="merge_key"),
        how="left",
        left_on="merge_key2",
        right_on="merge_key2",
        suffixes=("", "_identified_addresses2")
    )
    if merged_data.shape[0] != row_count:
        raise ValueError("Row numbers have changed")

    # A row counts as identified if either pass brought a postcode across from the identified addresses
    matched_on_full_key = merged_data["Postcode_identified_addresses"].notnull()
    matched_on_short_key = merged_data["Postcode_identified_addresses2"].notnull()
    merged_data["identified"] = matched_on_full_key | matched_on_short_key

    # HA32 issue: 36 Hessle Road addresses are in the identified addresses but not in the original
    # asset list, so exactly 36 identified rows are expected to match on neither key
    found_on_either_key = (
        identified_addresses["merge_key"].isin(merged_data["merge_key"])
        | identified_addresses["merge_key2"].isin(merged_data["merge_key2"])
    )
    unmatched = identified_addresses[~found_on_either_key]
    if unmatched.shape[0] != 36:
        raise ValueError("We have a problem here, investigate me, missings beyond the Hessle Road addresses")

    return merged_data, dropped_identified_merge_keys
def merge_ha_15(asset_list, identified_addresses):
    """
    Merge the HA15 asset list onto the HA15 identified addresses, producing one row per asset-list
    entry with a boolean ``identified`` column.

    The identified addresses hold the whole address in a single "Address" field, so four candidate
    keys are built from the asset list (address line 1 on its own, and combined with lines 2/3/4) and
    each is tried in turn against the single identified-address key.

    Both inputs are written to: the address/postcode corrections and the merge-key columns are
    assigned onto the frames that are passed in.
    :param asset_list: DataFrame with "Address Line 1" to "Address Line 4" and "Postcode" columns
    :param identified_addresses: DataFrame with "Address" and "Postcode" columns
    :return: tuple of (merged frame, list of merge keys of identified rows dropped as duplicates)
    :raises ValueError: if a duplicated asset-list key is also an identified address, if any merge
        changes the number of rows, or if any identified address matches none of the four keys
    """

    def squash(column):
        # Lower-cased, space-free text so that formatting differences do not prevent a match
        return column.astype(str).str.lower().str.strip().str.replace(" ", "", regex=False)

    def strip_punctuation(key):
        # regex=False is explicit: as a regular expression "." would match every character, and the
        # default value of `regex` differs between pandas versions
        return key.str.replace(",", "", regex=False).str.replace(".", "", regex=False)

    dropped_identified_merge_keys = []

    # Mary Mac Manus Drive is listed together with its town in the identified addresses
    identified_addresses["Address"] = identified_addresses["Address"].str.replace(
        "Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive", regex=False
    )
    # This address has the wrong postcode in the original asset list
    asset_list["Postcode"] = np.where(
        asset_list["Address Line 1"] == "103 Priory Crescent",
        "HP19 9NY",
        asset_list["Postcode"]
    )

    starting_rows = len(asset_list)

    line_1 = squash(asset_list["Address Line 1"])
    line_2 = squash(asset_list["Address Line 2"])
    line_3 = squash(asset_list["Address Line 3"])
    line_4 = squash(asset_list["Address Line 4"])
    asset_postcode = squash(asset_list["Postcode"])
    # Missing address lines become the text "nan" (via astype(str)), so a key that includes a
    # missing line will not match; the shorter keys cover those rows
    asset_list["merge_key"] = strip_punctuation(line_1 + asset_postcode)
    asset_list["merge_key2"] = strip_punctuation(line_1 + line_2 + line_3 + asset_postcode)
    asset_list["merge_key3"] = strip_punctuation(line_1 + line_2 + asset_postcode)
    asset_list["merge_key4"] = strip_punctuation(line_1 + line_2 + line_4 + asset_postcode)

    identified_addresses["merge_key"] = strip_punctuation(
        squash(identified_addresses["Address"]) + squash(identified_addresses["Postcode"])
    )

    # Repeated identified addresses are dropped (first occurrence kept) and their keys are reported
    identified_dupes = identified_addresses["merge_key"].duplicated()
    if identified_dupes.sum():
        logger.warning("We have %s duplicated identified addresses that will be dropped", identified_dupes.sum())
        dropped_identified_merge_keys.extend(identified_addresses[identified_dupes]["merge_key"].tolist())
        identified_addresses = identified_addresses.drop_duplicates("merge_key")

    # Repeated asset-list rows are kept, but none of them may be an identified address
    asset_list_dupes = asset_list["merge_key"].duplicated()
    if asset_list_dupes.sum():
        logger.warning(
            "We have some duplicated asset list rows - they won't be dropped but we make sure they aren't in the "
            "identified addresses"
        )
        dupe_keys = asset_list[asset_list_dupes]["merge_key"].tolist()
        check = identified_addresses[identified_addresses.merge_key.isin(dupe_keys)]
        if not check.empty:
            raise ValueError("We have a problem here, investigate me")

    # Try each asset-list key in turn against the identified-address key. Every pass is a left merge,
    # so the row count must stay equal to the asset list's.
    merge_passes = (
        ("merge_key", "_identified_addresses"),
        ("merge_key2", "_identified_addresses2"),
        ("merge_key3", "_identified_addresses3"),
        ("merge_key4", "_identified_addresses4"),
    )
    merged_data = asset_list
    for asset_key, suffix in merge_passes:
        merged_data = pd.merge(
            merged_data,
            identified_addresses,
            how="left",
            left_on=asset_key,
            right_on="merge_key",
            suffixes=("", suffix)
        )
        if merged_data.shape[0] != starting_rows:
            raise ValueError("Row numbers have changed")

    # A row counts as identified if any pass brought a postcode across from the identified addresses
    matched_postcode_columns = ["Postcode" + suffix for _, suffix in merge_passes]
    merged_data["identified"] = merged_data[matched_postcode_columns].notnull().any(axis=1)

    # Unlike HA32, every HA15 identified address is expected to be present in the asset list
    asset_keys = [asset_key for asset_key, _ in merge_passes]
    found = merged_data[asset_keys].apply(
        lambda candidate_keys: identified_addresses["merge_key"].isin(candidate_keys).to_numpy()
    ) if False else None
    found_on_any_key = pd.Series(False, index=identified_addresses.index)
    for asset_key in asset_keys:
        found_on_any_key = found_on_any_key | identified_addresses["merge_key"].isin(merged_data[asset_key])
    missed = identified_addresses[~found_on_any_key]
    if missed.shape[0]:
        raise ValueError("We have a problem here, investigate me, should not have any missings for ha15")

    return merged_data, dropped_identified_merge_keys
def prepare_model_data_row(
        property_id, modelling_epc, cleaned, cleaning_data, created_at,
        photo_supply_lookup, floor_area_decile_thresholds, old_data=None, full_sap_epc=None,
):
    """
    This function prepares the data for modelling, in the same fashion as the recommendation engine.
    With up-coming refactoring, this will change.

    It wraps the EPC in an EPCRecord and a Property, then builds the scoring data for a cavity wall
    insulation + loft insulation simulation, with the cavity measure as the primary recommendation.
    :param property_id: identifier of the property; also used to build the recommendation ids
    :param modelling_epc: the EPC to model; must support .copy() and expose "postcode" and "address1"
    :param cleaned: lookup passed to Property.get_components and create_base_difference_epc_record
    :param cleaning_data: cleaning data passed to EPCRecord
    :param created_at: not used by this function
    :param photo_supply_lookup: passed through to Property.get_components
    :param floor_area_decile_thresholds: passed through to Property.get_components
    :param old_data: optional older data for the property; copied when provided
    :param full_sap_epc: optional full SAP EPC for the property; copied when provided
    :return: a single-element list holding the scoring data for the property
    """
    # old_data and full_sap_epc default to None, so they are only copied when provided.
    # NOTE(review): assumes EPCRecord accepts None for these two entries - confirm against EPCRecord.
    epc_records = {
        'original_epc': modelling_epc.copy(),
        'full_sap_epc': full_sap_epc.copy() if full_sap_epc is not None else None,
        'old_data': old_data.copy() if old_data is not None else None,
    }
    prepared_epc = EPCRecord(
        epc_records=epc_records,
        run_mode="newdata",
        cleaning_data=cleaning_data
    )
    p = Property(
        id=property_id,
        postcode=modelling_epc["postcode"],
        address=modelling_epc["address1"],
        epc_record=prepared_epc
    )
    p.get_components(
        cleaned, photo_supply_lookup=photo_supply_lookup, floor_area_decile_thresholds=floor_area_decile_thresholds
    )
    p.create_base_difference_epc_record(cleaned_lookup=cleaned)

    # str.join only accepts strings, so a non-string id (e.g. an integer row id) is converted first;
    # for string ids the resulting recommendation ids are unchanged
    id_prefix = str(property_id)
    cavity_simulation = {
        "recommendation_id": "-".join([id_prefix, "cavity"]),
        "type": "cavity_wall_insulation",
        "new_u_value": 0.35,
        "parts": [{}]
    }
    loft_simulation = {
        "recommendation_id": "-".join([id_prefix, "loft"]),
        "type": "loft_insulation",
        "new_u_value": 0.16,
        "parts": [{"depth": 270}]
    }
    simulations = [
        cavity_simulation,
        loft_simulation
    ]
    recommendation_record = p.base_difference_record.df.to_dict("records")[0].copy()
    scoring_dict = p.create_recommendation_scoring_data(
        property_id=p.id,
        recommendation_record=recommendation_record,
        recommendations=simulations,
        primary_recommendation_id=cavity_simulation["recommendation_id"]
    )
    return [scoring_dict]
def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
    """
    Look up an EPC for every HA32 property and record the model based eligibility outcome for it.

    Rows without a house number are skipped. Rows for which the EPC search returns status 204 get a
    "No EPC found" record. All other rows are assessed with the Eligibility class and, when they are
    ECO4 (warmfront) eligible, scoring data is prepared for them as well.
    :param ha_data: merged HA32 frame; the code reads "row_id", "identified", "Dwelling num",
        "Dwelling name", "Street", "Postcode" and "Dwelling type"
    :param cleaned: lookup passed to Eligibility and prepare_model_data_row
    :param cleaning_data: passed to prepare_model_data_row
    :param created_at: passed to prepare_model_data_row
    :return: tuple of (list of result dicts, list of scoring data, list of row ids without a house number)
    """
    number_column = "Dwelling num"
    street_column = "Street"
    postcode_column = "Postcode"
    name_column = "Dwelling name"
    type_column = "Dwelling type"
    # Asset-list dwelling types -> property type handed to the EPC retrieval (None for non-dwellings)
    property_types = {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        'House': "House",
        'Store Room': None,
        'Bungalow Disabled': "Bungalow",
        'Flat Disabled': "Flat",
        'Dormer Bungalow': "Bungalow",
        'Pop-In': None,
        'Laundry': None,
        'Shed': None,
        'Bedsit': None,
    }

    def build_record(row, assessment, message):
        # One output row for a property that has an EPC
        return {
            "row_id": row["row_id"],
            "warmfront_identified": row["identified"],
            "gbis_eligible": assessment.gbis_warmfront,
            "eco4_eligible": assessment.eco4_warmfront["eligible"],
            "sap": float(assessment.epc["current-energy-efficiency"]),
            "roof": assessment.roof["clean_description"],
            "walls": assessment.walls["clean_description"],
            "date_epc": assessment.epc["lodgement-date"],
            "message": message,
            "gbis_eligible_future": assessment.gbis["eligible"],
            "gbis_eligible_future_message": assessment.gbis["message"],
            "eco4_eligible_future": assessment.eco4["eligible"],
            "eco4_eligible_future_message": assessment.eco4["message"],
            "tenure": assessment.tenure,
            "heating_description": assessment.epc["mainheat-description"],
        }

    scoring_data = []
    results = []
    no_house_numbers = []
    for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):
        # Without a house number we won't realistically be able to find an address, so the row is
        # noted and skipped
        if pd.isnull(house[number_column]):
            no_house_numbers.append(house["row_id"])
            continue

        # "<name> <number> <street>" when the dwelling has a name, otherwise "<number> <street>"
        address_parts = [house[number_column], house[street_column]]
        if not pd.isnull(house[name_column]):
            address_parts = [house[name_column]] + address_parts
        address1 = " ".join(address_parts)

        searcher = SearchEpc(
            address1=address1,
            postcode=house[postcode_column]
        )
        response = searcher.search()
        if response["status"] == 204:
            # No EPC was found; nothing can be assessed, even if the property was identified
            results.append(
                {
                    "row_id": house["row_id"],
                    "warmfront_identified": house["identified"],
                    "gbis_eligible": None,
                    "eco4_eligible": None,
                    "sap": None,
                    "roof": None,
                    "walls": None,
                    "date_epc": None,
                    "message": "No EPC found",
                    "gbis_eligible_future": None,
                    "gbis_eligible_future_message": None,
                    "eco4_eligible_future": None,
                    "eco4_eligible_future_message": None,
                    "tenure": None,
                    "heating_description": None,
                }
            )
            continue

        newest_epc, older_epcs, _ = searcher.retrieve(
            property_type=property_types.get(house[type_column], None)
        )
        # The penultimate EPC is the newest of the older EPCs; fall back to the newest EPC if there
        # is no older one
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        if not penultimate_epc:
            penultimate_epc = newest_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()

        # If the newest EPC shows no eligibility AND the property was identified by Warmfront, the
        # newest EPC may be a post-retrofit EPC, in which case the penultimate EPC is the pre-retrofit
        # one - so that is assessed instead. If the property has NOT been identified, the newest EPC
        # reflects the current state of the home and is what determines whether there is a new
        # opportunity for retrofit, so it is kept.
        eligible_on_newest = eligibility.eco4_warmfront["eligible"] or eligibility.gbis_warmfront
        if (not eligible_on_newest) and house["identified"]:
            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()

        # The full gbis and eco4 checks feed the *_future fields of the record
        eligibility.check_gbis()
        eligibility.check_eco4()

        message = None
        if eligibility.eco4_warmfront["eligible"]:
            # NOTE(review): prepare_model_data_row in this file also declares photo_supply_lookup and
            # floor_area_decile_thresholds as required parameters, which are not supplied here - confirm
            scoring_dictionary = prepare_model_data_row(
                property_id=house["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at
            )
            scoring_data.append(scoring_dictionary)
            message = "eco4 conditional on post sap"

        # Recorded whether the property is eco4 eligible, gbis eligible or not eligible at all
        results.append(build_record(house, eligibility, message))

    return results, scoring_data, no_house_numbers
def get_ha_15data(ha_data, cleaned, cleaning_data, created_at):
    """
    Look up an EPC for every HA15 property, record the model based eligibility outcome for it, score
    the ECO4 eligible properties with the model API and classify how confident we are in each of them.

    Rows for which the EPC search returns status 204 get a "No EPC found" record. All other rows are
    assessed with the Eligibility class and, when they are ECO4 (warmfront) eligible, scoring data is
    prepared for them.
    :param ha_data: merged HA15 frame; the code reads "row_id", "identified", "Address Line 1",
        "Postcode" and "Property Type"
    :param cleaned: lookup passed to Eligibility and prepare_model_data_row
    :param cleaning_data: passed to prepare_model_data_row and to the DataProcessor cleaning steps
    :param created_at: passed to prepare_model_data_row and used as the ModelApi timestamp
    :return: tuple of (results DataFrame with "post_install_sap" and "eligibility_classification"
        columns, cleaned scoring DataFrame, list of row ids without a house number - always empty here)
    """
    address_key = "Address Line 1"
    postcode_key = "Postcode"
    house_type_key = "Property Type"
    # Asset-list property types -> property type handed to the EPC retrieval
    house_type_lookup = {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        'House': "House",
        'Flat over garage': "Flat",
        'Maisonette': "Maisonette",
    }
    scoring_data = []
    results = []
    # HA15 has no separate house number column (the number is part of "Address Line 1"), so no row is
    # ever skipped for a missing number; the empty list is returned to keep the same return shape as
    # get_ha_32data
    no_house_numbers = []

    def build_record(row, assessment, message):
        # One output row for a property that has an EPC
        return {
            "row_id": row["row_id"],
            "warmfront_identified": row["identified"],
            "gbis_eligible": assessment.gbis_warmfront,
            "eco4_eligible": assessment.eco4_warmfront["eligible"],
            "sap": float(assessment.epc["current-energy-efficiency"]),
            "roof": assessment.roof["clean_description"],
            "walls": assessment.walls["clean_description"],
            "date_epc": assessment.epc["lodgement-date"],
            "message": message,
            "gbis_eligible_future": assessment.gbis["eligible"],
            "gbis_eligible_future_message": assessment.gbis["message"],
            "eco4_eligible_future": assessment.eco4["eligible"],
            "eco4_eligible_future_message": assessment.eco4["message"],
            "tenure": assessment.tenure,
            "heating_description": assessment.epc["mainheat-description"],
        }

    def classify(current_sap, post_install_sap):
        # The SAP score to reach depends on the current SAP: 55 for properties currently at 38 or
        # below (described in the original notes as F or G rated), otherwise 69. Because the model
        # can be wrong, the confidence drops as the predicted score gets closer to / falls below the
        # target: target + 2 -> highest, target -> high, target - 2 -> medium, lower -> unlikely.
        target = 55 if current_sap <= 38 else 69
        if post_install_sap >= target + 2:
            return "highest confidence"
        if post_install_sap >= target:
            return "high confidence"
        if post_install_sap >= target - 2:
            return "medium confidence"
        return "unlikely"

    for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):
        searcher = SearchEpc(
            address1=house[address_key],
            postcode=house[postcode_key]
        )
        response = searcher.search()
        if response["status"] == 204:
            # No EPC was found; nothing can be assessed, even if the property was identified.
            # The record carries the same keys as every other record (including the gbis future
            # fields) so the results frame always has the same columns.
            results.append(
                {
                    "row_id": house["row_id"],
                    "warmfront_identified": house["identified"],
                    "gbis_eligible": None,
                    "eco4_eligible": None,
                    "sap": None,
                    "roof": None,
                    "walls": None,
                    "date_epc": None,
                    "message": "No EPC found",
                    "gbis_eligible_future": None,
                    "gbis_eligible_future_message": None,
                    "eco4_eligible_future": None,
                    "eco4_eligible_future_message": None,
                    "tenure": None,
                    "heating_description": None,
                }
            )
            continue

        newest_epc, older_epcs, _ = searcher.retrieve(
            property_type=house_type_lookup.get(house[house_type_key], None)
        )
        # The penultimate EPC is the newest of the older EPCs; fall back to the newest EPC if there
        # is no older one
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        if not penultimate_epc:
            penultimate_epc = newest_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()

        # If the newest EPC shows no eligibility AND the property was identified by Warmfront, the
        # newest EPC may be a post-retrofit EPC, in which case the penultimate EPC is the pre-retrofit
        # one - so that is assessed instead. If the property has NOT been identified, the newest EPC
        # reflects the current state of the home and is what determines whether there is a new
        # opportunity for retrofit, so it is kept.
        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (house["identified"]):
            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()

        # The full gbis and eco4 checks feed the *_future fields of the record
        eligibility.check_gbis()
        eligibility.check_eco4()

        message = None
        if eligibility.eco4_warmfront["eligible"]:
            # NOTE(review): prepare_model_data_row in this file also declares photo_supply_lookup and
            # floor_area_decile_thresholds as required parameters, which are not supplied here - confirm
            scoring_dictionary = prepare_model_data_row(
                property_id=house["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at
            )
            # NOTE(review): prepare_model_data_row returns a single-element list, so this builds a
            # list of lists - confirm this is the shape pd.DataFrame(scoring_data) is meant to receive
            scoring_data.append(scoring_dictionary)
            message = "eco4 conditional on post sap"

        # Recorded whether the property is eco4 eligible, gbis eligible or not eligible at all
        results.append(build_record(house, eligibility, message))

    # with open("ha_15_outputs.pickle", "rb") as f:
    #     results_dict = pickle.load(f)
    # results = results_dict["results"]
    # scoring_data = results_dict["scoring_data"]
    # no_house_numbers = results_dict["no_house_numbers"]

    scoring_df = pd.DataFrame(scoring_data)

    # Implement the same process that is being used in the recommendation engine to clean scoring_df.
    # Perform the same cleaning as in the model - first clean the number of room variables though.
    # NOTE(review): DataProcessor is not among the imports at the top of this file - confirm where it
    # is expected to come from
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
    )
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
    ).drop(columns=["LOCAL_AUTHORITY"])
    scoring_df = DataProcessor.clean_missings_after_description_process(
        scoring_df,
        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
            "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
    )
    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)

    # NOTE(review): the portfolio id says "ha32" although this is the HA15 run - confirm
    model_api = ModelApi(portfolio_id="ha32-eligibility", timestamp=created_at)
    all_predictions = model_api.predict_all(
        df=scoring_df,
        bucket="retrofit-data-dev",
        prediction_buckets={
            "sap_change_predictions": "retrofit-sap-predictions-dev",
            "heat_demand_predictions": "retrofit-heat-predictions-dev",
            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
        }
    )

    # Merge the predicted post-install SAP onto the results
    predictions = all_predictions["sap_change_predictions"]
    results_df = pd.DataFrame(results)
    results_df = results_df.merge(
        predictions[["predictions", "property_id"]].rename(
            columns={"predictions": "post_install_sap", "property_id": "row_id"}
        ),
        how="left",
        on="row_id"
    )

    # Our methodology for identifying properties is to use the post-install SAP score; only the eco4
    # eligible rows are classified (see classify above for the thresholds)
    eligibility_assessment = [
        {
            "row_id": row["row_id"],
            "eligibility_classification": classify(row["sap"], row["post_install_sap"])
        }
        for _, row in results_df[results_df["eco4_eligible"] == True].iterrows()
    ]
    if eligibility_assessment:
        results_df = results_df.merge(
            pd.DataFrame(eligibility_assessment), how="left", on="row_id"
        )
    else:
        # An empty frame has no "row_id" column to merge on, so the column is added directly
        results_df["eligibility_classification"] = None

    return results_df, scoring_df, no_house_numbers
def analyse_ha_32_results(results, ha32, no_house_numbers):
    """
    We want to know:
    1) What proportion of identified properties we get correct
    2) If we miss identified properties, why
    3) Which properties do we identify that were not identified by warmfront. What is our confidence on these?
    For HA32, most of these (if not all) properties were identified under gbis
    :param results: list of result dicts, as produced by get_ha_32data
    :param ha32: merged HA32 frame; the code reads "row_id" and "identified"
    :param no_house_numbers: row ids that were skipped because they have no house number
    :return: tuple of (success_rate, summary dict). success_rate is the share of warmfront identified
        properties that the model flags as gbis eligible. The summary dict holds the number of new /
        future gbis / future eco possibilities and, for each, the value counts of "high_confidence".
    """
    results_df = pd.DataFrame(results)
    results_df["tenure"] = results_df["tenure"].fillna("Unknown - probably new build")

    # The eligibility columns hold None for "No EPC found" rows, so every flag is compared to True
    # explicitly; this gives plain boolean masks that all share the index of results_df
    identified = results_df["warmfront_identified"] == True
    gbis_now = results_df["gbis_eligible"] == True
    eco4_now = results_df["eco4_eligible"] == True
    eligible_now = gbis_now | eco4_now
    gbis_future = results_df["gbis_eligible_future"] == True
    eco4_future = results_df["eco4_eligible_future"] == True
    not_eco4_future = results_df["eco4_eligible_future"] == False

    # What proportion of the warmfront identified properties do we also flag (under gbis)?
    # For HA32, this is 89%
    warmfront_identified = results_df[identified]
    success_rate = (warmfront_identified["gbis_eligible"] == True).sum() / warmfront_identified.shape[0]

    # Findings from manually investigating the identified properties that we missed:
    # For these properties, warmfront identified all of them, however two did not seem to look valid.
    # We could perhaps update our detection, if the properties not found are not currently EPC C or above, but
    # do not look eligible from a building materials perspective
    # Reason 1: The EPC indicates that the cavity is filled (GBIS allows for more than just cavity measures, however
    #           we check just the cavity for GBIS homes, since I believe this is what Warmfront have in place with
    #           regards to commercial agreements with the installer. An example of this is 30 Coxwold Grove,
    #           HU4 6HH.
    #
    # Reason 2: Some properties do not have any existing data. This amounts for 16 of the 50 that we missed.
    #           We will be implementing a solution to interpolate homes that do not have any data, based on their
    #           neighbours. An example of this is 979 Hessle Road, HU4 6QG. If we look at the neighbours, we would
    #           likely infer that this property has an empty cavity and therefore would identify
    #
    # Reason 3: Some properties, e.g. 975 Hessle Road, HU4 6QG, look like they would qualify for GBIS,
    #           but is already a C, based on its Nov 2022 EPC (it was a C before that too). I'm personally not sure
    #           why this home would get identified as you would not be able to get GBIS funding. Same for 977 Hessle
    #           road. This was the most common reason. Another example 8 Edith Cavell Court, HU5 4BA
    #
    # Reason 4: Some properties are a combination of reason 1 and 3. This could be to do with inaccurate EPCs as
    #           empirically speaking, when going through this manually, it seemed like the ones that fall into this
    #           category had slightly older EPCs (pre-2019). There are a few like this but e.g
    #           3, Summergroves Way HU4 6SZ

    # We now look for properties that we identified, that were not identified by Warmfront
    not_identified = ~identified
    new_possibilities = results_df[not_identified & eligible_now].copy()
    # Future possibilities are not eligible today, but are eligible under the full eco4 / gbis checks;
    # the gbis ones exclude anything that is already a future eco4 possibility
    future_possibilities_eco = results_df[not_identified & eco4_future & ~eligible_now].copy()
    future_possibilities_gbis = results_df[
        not_identified & gbis_future & not_eco4_future & ~eligible_now
    ].copy()

    # We deem that any EPC that is produced in the last 3 years gives us good confidence.
    # The flag is set on all three frames, since all three are summarised below.
    cutoff_date = datetime.now() - timedelta(days=3 * 365)
    for possibilities in (new_possibilities, future_possibilities_gbis, future_possibilities_eco):
        possibilities["high_confidence"] = pd.to_datetime(possibilities["date_epc"]) >= cutoff_date

    # We do a quick check on properties that didn't have a house number:
    no_house_numbers_ha32 = ha32[ha32["row_id"].isin(no_house_numbers)]["identified"].sum()
    if no_house_numbers_ha32:
        logger.error("We have some identified properties that have no house numbers - investigate me")

    new = {
        "n_new_possibilities": new_possibilities.shape[0],
        "new_possibilities_confidence": new_possibilities["high_confidence"].value_counts(),
        "future_possibilities_gbis": future_possibilities_gbis.shape[0],
        "future_possibilities_gbis_confidence": future_possibilities_gbis["high_confidence"].value_counts(),
        "future_possibilities_eco": future_possibilities_eco.shape[0],
        "future_possibilities_eco_confidence": future_possibilities_eco["high_confidence"].value_counts(),
    }
    return success_rate, new
def analyse_ha_15_results(results_df, ha15, no_house_numbers):
    """
    Compare the model's eligibility flags for HA15 against the properties identified by Warmfront.

    We want to know:
    1) What proportion of Warmfront-identified properties we also flag as eligible (GBIS or ECO4)
    2) If we miss identified properties, why (no EPC found, or which side of the SAP threshold they sit)
    3) Which properties we flag that were not identified by Warmfront, and our confidence in these

    NOTE: the "tenure" column of ``results_df`` is filled in place, so the caller's frame is modified.

    :param results_df: one row per property. Columns read here: "tenure", "warmfront_identified",
        "gbis_eligible", "eco4_eligible", "eligibility_classification", "message", "sap", "date_epc"
    :param ha15: not used by this function; kept so existing callers keep working
    :param no_house_numbers: not used by this function; kept so existing callers keep working
    :return: tuple of (success_rate, new, identified_results, missed_results) where
        - success_rate is the share of Warmfront-identified rows we flag as GBIS or ECO4 eligible
          (NaN when there are no Warmfront-identified rows)
        - new holds the rows we flag that Warmfront did not, plus ECO4/GBIS counts and confidence breakdowns
        - identified_results holds the count, success rate and ECO4 classification breakdown
        - missed_results holds counts describing the Warmfront-identified rows we did not flag
    """
    results_df["tenure"] = results_df["tenure"].fillna("Unknown - probably new build")

    # What proportion of the Warmfront-identified properties do we also flag?
    warmfront_identified = results_df[results_df["warmfront_identified"]]

    # Strict masks: only a literal True counts as eligible, so missing values are treated as not eligible.
    # Both masks come from the same frame, which keeps "identified" and "missed" an exact partition
    # of the Warmfront rows.
    wf_gbis = warmfront_identified["gbis_eligible"].eq(True)
    wf_eco = warmfront_identified["eco4_eligible"].eq(True)
    wf_flagged = wf_gbis | wf_eco

    n_identified = wf_flagged.sum()
    n_warmfront = warmfront_identified.shape[0]
    # Original author's note: "For HA15 this is 50.3%" (presumably the success rate - confirm)
    success_rate = n_identified / n_warmfront if n_warmfront else float("nan")

    # Of the Warmfront properties we flag for ECO4, what is the mix of confidence?
    eco_identified_confidence = warmfront_identified[wf_eco]["eligibility_classification"].value_counts()

    # Warmfront-identified properties that we flag for neither scheme
    missed = warmfront_identified[~wf_flagged]
    missed_no_data = missed[missed["message"] == "No EPC found"].shape[0]

    # Rows with a missing SAP score fall into neither bucket
    sap_threshold = 69
    sap_too_high = missed[missed["sap"] >= sap_threshold]
    sap_low_enough = missed[missed["sap"] < sap_threshold]

    # We now look for properties that we identified, that were not identified by Warmfront
    new_possibilities = results_df[
        (~results_df["warmfront_identified"]) &
        (results_df["gbis_eligible"].eq(True) | results_df["eco4_eligible"].eq(True))
    ].copy()

    # We deem that any EPC that is produced in the last 3 years gives us good confidence for GBIS
    cutoff_date = datetime.now() - timedelta(days=3 * 365)
    new_possibilities["high_confidence"] = pd.to_datetime(new_possibilities["date_epc"]) >= cutoff_date

    new_eco = new_possibilities["eco4_eligible"].eq(True)
    new_gbis = new_possibilities["gbis_eligible"].eq(True)

    eco_new_possibilities = new_possibilities["eco4_eligible"].sum()
    eco_new_possibilities_confidence = new_possibilities[new_eco]["eligibility_classification"].value_counts()

    # NOTE(review): the GBIS figures include rows that are also ECO4 eligible - confirm this is intended
    gbis_new_possibilites = new_possibilities["gbis_eligible"].sum()
    gbis_new_possibilites_confidence = new_possibilities[new_gbis]["high_confidence"].value_counts()

    new = {
        "new_possibilities": new_possibilities,
        "eco_new_possibilities": eco_new_possibilities,
        "eco_new_possibilities_confidence": eco_new_possibilities_confidence,
        "gbis_new_possibilites": gbis_new_possibilites,
        "gbis_new_possibilites_confidence": gbis_new_possibilites_confidence
    }

    identified_results = {
        "n_identified": n_identified,
        "success_rate": success_rate,
        "eco_identified_confidence": eco_identified_confidence
    }

    missed_results = {
        "n_missed": missed.shape[0],
        "n_sap_too_high": sap_too_high.shape[0],
        "n_sap_low_enough": sap_low_enough.shape[0],
        "missed_no_data": missed_no_data
    }

    return success_rate, new, identified_results, missed_results
def app():
    """
    Run the HA32 and HA15 eligibility comparison end to end.

    Steps visible in this function:
    1) Load the asset lists and identified-address files and merge them per housing association
    2) Give every row a "row_id" ("h32<n>" / "h15<n>", numbered by row position)
    3) Read the cleaned EPC data and the cleaning dataset from the "retrofit-data-dev" S3 bucket
    4) Produce results for each housing association and pass them to the matching analysis function
    """
    ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses = load_data()

    # Only the first element returned by each merge is used here; the second is discarded
    ha32, _ = marge_ha_32(asset_list=ha32_asset_list, identified_addresses=ha32_identified_addresses)
    ha15, _ = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses)

    # Positional row identifiers, prefixed per housing association
    ha32["row_id"] = ["h32" + str(i) for i in range(0, len(ha32))]
    ha15["row_id"] = ["h15" + str(i) for i in range(0, len(ha15))]

    # NOTE: the object key ends in ".bson" but the payload is decoded with msgpack below
    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

    cleaning_data = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
    )

    # A single timestamp is taken once and passed to both the HA32 and HA15 runs
    created_at = datetime.now().isoformat()

    # We want to retrieve EPCs for every single property
    # NOTE: HA32 is MOSTLY cavity via GBIS
    ha32_results, ha32_scoring_data, ha32_no_house_numbers = get_ha_32data(
        ha_data=ha32,
        cleaned=cleaned,
        cleaning_data=cleaning_data,
        created_at=created_at
    )

    # Disabled local cache of the HA32 outputs: uncomment the first block to write the pickle,
    # or the second block to load it.
    # with open("ha32.pickle", "wb") as f:
    #     pickle.dump(
    #         {
    #             "ha32_results": ha32_results,
    #             "ha32_scoring_data": ha32_scoring_data,
    #             "ha32_no_house_numbers": ha32_no_house_numbers
    #         },
    #         f
    #     )

    # with open("ha32.pickle", "rb") as f:
    #     ha32_dict = pickle.load(f)
    #
    # ha32_results = ha32_dict["ha32_results"]
    # ha32_scoring_data = ha32_dict["ha32_scoring_data"]
    # ha32_no_house_numbers = ha32_dict["ha32_no_house_numbers"]

    # NOTE(review): the keyword here is `results`, whereas analyse_ha_15_results takes `results_df` -
    # confirm it matches the analyse_ha_32_results signature
    ha32_success_rate, ha32_new_possibilities = analyse_ha_32_results(
        results=ha32_results, ha32=ha32, no_house_numbers=ha32_no_house_numbers
    )

    # HA 15
    ha15_results_df, ha15_scoring_df, ha15_no_house_numbers = get_ha_15data(ha15, cleaned, cleaning_data, created_at)

    # Disabled local cache of the HA15 outputs, same pattern as for HA32 above
    # with open("ha15.pickle", "wb") as f:
    #     pickle.dump(
    #         {
    #             "ha15_results_df": ha15_results_df,
    #             "ha15_scoring_df": ha15_scoring_df,
    #             "ha15_no_house_numbers": ha15_no_house_numbers
    #         },
    #         f
    #     )

    # with open("ha15.pickle", "rb") as f:
    #     ha15_dict = pickle.load(f)
    #
    # ha15_results_df = ha15_dict["ha15_results_df"]
    # ha15_scoring_df = ha15_dict["ha15_scoring_df"]
    # ha15_no_house_numbers = ha15_dict["ha15_no_house_numbers"]

    ha15_success_rate, ha15_new, ha15_identified_results, ha15_missed_results = analyse_ha_15_results(
        results_df=ha15_results_df,
        ha15=ha15,
        no_house_numbers=ha15_no_house_numbers
    )