Model/etl/eligibility/ha_15_32/ha25_app.py
2024-01-16 11:10:56 +00:00

883 lines
37 KiB
Python

import os
import msgpack
import openpyxl
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
from utils.s3 import read_from_s3
from utils.logger import setup_logger
from dotenv import load_dotenv
from utils.s3 import read_dataframe_from_s3_parquet
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
from etl.eligibility.ha_15_32.app import prepare_model_data_row
from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from recommendation_utils import convert_thickness_to_numeric
import re
# Location of the .env file holding local configuration for this script.
# NOTE(review): this module appears to live in etl/eligibility/ha_15_32/ already, so
# appending the same sub-path to the module's parent directory may point at a location
# that does not exist (load_dotenv silently does nothing then) - confirm the intended path.
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
logger = setup_logger()
# Load the .env BEFORE reading configuration from the environment; otherwise a token that
# is only defined in the .env file would be read as None. load_dotenv() does not override
# variables that are already set in the process environment.
load_dotenv(ENV_FILE)
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def _row_fill_colour(row):
    """Return the fill colour index of the row's first cell, or None when it is unfilled.

    The first cell's colour is assumed to be indicative of the entire row.
    """
    colour = row[0].fill.start_color.index
    return None if colour == '00000000' else colour


def _read_rows_and_colours(sheet, min_row):
    """Read cell values and the per-row fill colour from a worksheet in a single pass."""
    rows = []
    colours = []
    for row in sheet.iter_rows(min_row=min_row, values_only=False):
        rows.append([cell.value for cell in row])
        colours.append(_row_fill_colour(row))
    return rows, colours


def _load_asset_list():
    """Load the HA 25 asset list and derive colour, address, house number and postcode columns."""
    workbook = openpyxl.load_workbook(
        'etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True
    )
    rows_data, rows_colors = _read_rows_and_colours(workbook.active, min_row=1)

    # Headers are on the final row: pop them (and the matching colour entry) off the data.
    headers = rows_data.pop()
    rows_colors.pop()
    # The postcode header is None, so we replace it with "postcode"
    headers[-1] = "postcode"

    asset_list = pd.DataFrame(rows_data, columns=headers)
    asset_list["row_color"] = rows_colors
    # Anything that is neither red nor green is treated as yellow.
    asset_list["row_colour_name"] = np.where(
        asset_list["row_color"] == "FFFF0000", "red",
        np.where(asset_list["row_color"] == "FF00B050", "green", "yellow")
    )
    asset_list["row_colour_code"] = np.where(
        asset_list["row_colour_name"] == "red", "does not meet criteria",
        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
    )

    address = (
        asset_list["T1_Address"]
        .str.lower()
        .str.replace("flat", "", regex=False)
        .str.strip()
    )
    asset_list["address"] = address
    # Only the first token of the address (the house number) is needed. Taking it directly
    # avoids assuming that the longest address splits into a fixed number of words.
    asset_list["HouseNo"] = address.str.split(' ').str[0].str.replace(";", "", regex=False)
    asset_list["postcode"] = asset_list["postcode"].str.strip()
    # Remove semi colons from the raw address (done after "address" has been derived from it)
    asset_list["T1_Address"] = asset_list["T1_Address"].str.replace(";", "", regex=False)
    return asset_list


def _load_eco3_survey_list():
    """Load the historical ECO3 survey list ("CAVITY" sheet) with decoded row colours.

    This is ad hoc analysis used to gauge historical completion and cancellation rates.
    """
    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
    sheet = workbook["CAVITY"]
    # The first row holds the headers
    rows, colours = _read_rows_and_colours(sheet, min_row=2)
    survey = pd.DataFrame(rows, columns=[cell.value for cell in sheet[1]])
    survey["row_colour"] = colours
    # Remove rows where street name is missing
    survey = survey[~pd.isnull(survey["Street / Block Name"])].copy()

    colour_names = {
        "FF7030A0": "purple",
        "FF92D050": "green",
        "FFFF0000": "red",
        "FFFFFF00": "yellow",
        "FF38FD23": "green",
    }
    colour_meanings = {
        "red": "cancelled",
        "green": "installed advised install complete",
        "purple": "installer advised install complete + post works EPC",
        "yellow": "filler row - drop",
    }
    survey["row_colour_name"] = survey["row_colour"].map(colour_names).fillna("unknown")
    survey["row_colour_code"] = survey["row_colour_name"].map(colour_meanings).fillna("unknown")
    return survey


def _load_eco4_prospects():
    """Load the survey list of properties identified as ECO4 prospects and apply manual corrections."""
    workbook = openpyxl.load_workbook(
        'etl/eligibility/ha_15_32/HESTIA - HA 25 ADHOC ISOLATED IDENTIFIED PROPERTIES FOR CWI.xlsx'
    )
    sheet = workbook["LiveWest"]
    # The first row holds the headers
    rows, colours = _read_rows_and_colours(sheet, min_row=2)
    prospects = pd.DataFrame(rows, columns=[cell.value for cell in sheet[1]])
    prospects["row_colour"] = colours
    prospects["ADDRESS 1"] = prospects["ADDRESS 1"].str.lower().str.strip()
    # Copy after filtering so the assignments below act on an independent frame
    prospects = prospects[~pd.isnull(prospects["ADDRESS 1"])].copy()
    prospects["survey_key"] = ["survey_" + str(i) for i in range(len(prospects))]

    # Correct some errors in the survey list
    prospects.loc[
        (prospects["ADDRESS 1"] == "berry park") & (prospects["POSTCODE"] == "PL12 6HP"),
        "POSTCODE",
    ] = "PL12 6EN"
    # Remove semi colons from the address
    prospects["ADDRESS 1"] = prospects["ADDRESS 1"].str.replace(";", "", regex=False)
    # 6 WALKHAM MEADOWS is listed twice, which should be 6a and 6b.
    # NOTE(review): these are hard-coded index labels tied to the current spreadsheet; if a
    # label is absent .loc will append a new row rather than fail.
    prospects.loc[838, "NO"] = "6a"
    prospects.loc[839, "NO"] = "6b"
    # 3, 7, 9 BOLDVENTURE ROAD should be BOLDVENTURE CLOSE
    prospects.loc[
        (prospects["ADDRESS 1"] == "boldventure road") & (prospects["NO"].isin([3, 7, 9])),
        "ADDRESS 1",
    ] = "boldventure close"
    prospects.loc[
        (prospects["ADDRESS 1"] == "old farm road") & (prospects["POSTCODE"] == "PL5 1EP"),
        "ADDRESS 1",
    ] = "old school road"
    prospects.loc[
        (prospects["ADDRESS 1"] == "croft orchard")
        & (prospects["POSTCODE"] == "TQ12 6RP")
        & (prospects["NO"] == 52),
        "ADDRESS 1",
    ] = "drum way"
    # String replace
    prospects["ADDRESS 1"] = prospects["ADDRESS 1"].str.replace(
        "the gulls, collaton road", "the gulls collaton road", regex=False
    )
    prospects["ADDRESS 1"] = prospects["ADDRESS 1"].str.replace(
        "crows-an-eglose", "crows-an-eglos", regex=False
    )
    return prospects


def _match_prospects_to_assets(asset_list, prospects):
    """Match each prospect survey row to exactly one asset-list row.

    A survey row is matched by progressively narrowing the asset list on street name, house
    number and postcode; rows that do not narrow down to exactly one asset are left unmatched.
    Returns a DataFrame with one record per matched survey row.
    """
    # Streets known not to be in the asset list
    streets_not_in_asset_list = [
        "kaynton mead", "broadmoor lane", "hoopers barton", "ecos court", "selwood road",
        "castle street"
    ]
    # Loop invariant: lower-cased asset addresses used for the first filter
    asset_address_lower = asset_list["T1_Address"].str.lower()

    matched = []
    for _, row in tqdm(prospects.iterrows(), total=len(prospects)):
        street = row["ADDRESS 1"]
        # Individual properties known not to be in the asset list
        if street == "berry park" and row["NO"] in [40, 42] and row["POSTCODE"] == "PL12 6EN":
            continue
        if street == "roberts road" and row["NO"] == 23 and row["POSTCODE"] == "PL5 1DP":
            continue
        if street in streets_not_in_asset_list:
            continue

        house_number = row["NO"]
        if isinstance(house_number, str):
            house_number = house_number.lower()
            if "flat" in house_number:
                house_number = house_number.split("flat")[1].strip()

        # Filter on the first line of the address
        df = asset_list[asset_address_lower.str.contains(street.lower())].copy()
        # Each further filter is only applied while the candidate set is not yet unique
        if house_number is not None and df.shape[0] != 1:
            df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
        if df.shape[0] != 1 and house_number is not None:
            df = df[df["HouseNo"] == str(house_number)]
        if df.shape[0] != 1 and row["POSTCODE"] is not None:
            df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
        if df.shape[0] != 1:
            continue

        matched.append(
            {
                "survey_key": row["survey_key"],
                "matched_address": df["T1_Address"].values[0],
                "survey_house_no": row["NO"],
                "survey_street_name": row["ADDRESS 1"],
                "survey_postcode": row["POSTCODE"],
            }
        )

    # Explicit columns keep the frame usable even when nothing matched
    return pd.DataFrame(
        matched,
        columns=["survey_key", "matched_address", "survey_house_no", "survey_street_name", "survey_postcode"],
    )


def load_data():
    """Load the HA 25 spreadsheets and link the ECO4 prospect surveys to the asset list.

    Returns:
        tuple: ``(data, eco4_prospects_survey_list, lost_identified_properties)`` where
        ``data`` is the asset list left-joined to the matched survey rows (with a boolean-like
        ``warmfront_identified`` column), ``eco4_prospects_survey_list`` is the cleaned prospect
        survey list, and ``lost_identified_properties`` are the survey rows that could not be
        matched to exactly one asset.
    """
    asset_list = _load_asset_list()

    # Historical ECO3 survey list: good enough for indicative cancellation rates. It is
    # loaded for ad hoc inspection only and is not part of the returned data.
    _load_eco3_survey_list()

    # The indicative survey list which identified prospects for ECO4 works
    eco4_prospects_survey_list = _load_eco4_prospects()

    # We have a high volume of rows that do not match
    matched = _match_prospects_to_assets(asset_list, eco4_prospects_survey_list)
    matched["warmfront_identified"] = True

    # Combine asset list and surveys
    data = asset_list.merge(
        matched, how="left", left_on="T1_Address", right_on="matched_address",
    )
    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)

    lost_identified_properties = eco4_prospects_survey_list[
        ~eco4_prospects_survey_list["survey_key"].isin(matched["survey_key"])
    ]
    return data, eco4_prospects_survey_list, lost_identified_properties
def map_year_to_age_band(year):
    """Map a build year to its "England and Wales: ..." construction age band label.

    Args:
        year: Anything ``int()`` accepts (int, float, numeric string).

    Returns:
        The age band label, or ``"Invalid Year"`` when ``year`` cannot be converted to an
        integer (including ``None``, NaN and infinities).
    """
    try:
        year = int(year)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None, ValueError covers bad strings / NaN, OverflowError covers inf
        return "Invalid Year"

    # (inclusive upper bound, label), in ascending order
    bands = (
        (1899, "England and Wales: before 1900"),
        (1929, "England and Wales: 1900-1929"),
        (1949, "England and Wales: 1930-1949"),
        (1966, "England and Wales: 1950-1966"),
        (1975, "England and Wales: 1967-1975"),
        (1982, "England and Wales: 1976-1982"),
        (1990, "England and Wales: 1983-1990"),
        (1995, "England and Wales: 1991-1995"),
        (2002, "England and Wales: 1996-2002"),
        (2006, "England and Wales: 2003-2006"),
        (2011, "England and Wales: 2007-2011"),
    )
    for upper_bound, label in bands:
        if year <= upper_bound:
            return label
    # All remaining years are 2012 onwards
    return "England and Wales: 2012 onwards"
def _classify_eco4_confidence(current_sap, post_install_sap):
    """Grade how likely the predicted post-install SAP is to reach the required target.

    Properties with a current SAP of 38 or below are measured against a target of 55, all
    others against 69. Reaching target + 2 is "highest confidence", the target itself
    "high confidence", target - 2 "medium confidence", anything else "unlikely".
    NOTE(review): presumably 55 / 69 are the lower bounds of EPC bands D / C - confirm.
    """
    target = 55 if current_sap <= 38 else 69
    if post_install_sap >= target + 2:
        return "highest confidence"
    if post_install_sap >= target:
        return "high confidence"
    if post_install_sap >= target - 2:
        return "medium confidence"
    return "unlikely"


def get_epc_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds):
    """Look up EPCs for each asset, run the eligibility checks and score eligible properties.

    Args:
        data: Asset rows. A DataFrame is iterated row by row; any other iterable must yield
            ``(key, row)`` pairs. Each row is read for "HouseNo", "postcode", "address",
            "T1_AssetType", "T1_Address", "row_id" and "Build Yr".
        cleaned: Cleaned EPC lookup passed through to ``Eligibility`` and the scoring helpers.
        cleaning_data: Reference data used by ``DataProcessor`` averages cleaning.
        created_at: Timestamp passed to the scoring row builder and ``ModelApi``.
        photo_supply_lookup: Passed through to ``prepare_model_data_row``.
        floor_area_decile_thresholds: Passed through to ``prepare_model_data_row``.

    Returns:
        tuple: ``(results_df, scoring_data, nodata)`` - one result row per property with an
        EPC (including predicted SAP uplift and an ECO4 confidence classification), the raw
        scoring rows, and the input rows for which no EPC was found.
    """
    scoring_data = []
    results = []
    nodata = []
    property_type_lookup = {
        "Flat": {"property-type": "Flat", "built-form": None},
        "Mid Terrace House": {"property-type": "House", "built-form": "Mid-Terrace"},
        "End Terrace House": {"property-type": "House", "built-form": "End-Terrace"},
        "Maisonnette": {"property-type": "Flat", "built-form": None},
        "Semi Detached House": {"property-type": "House", "built-form": "Semi-Detached"},
        "Detached House": {"property-type": "House", "built-form": "Detached"},
        "Coach House": {"property-type": "House", "built-form": "Detached"},
        "Bungalow": {"property-type": "Bungalow", "built-form": None},
        "Detached Bungalow": {"property-type": "Bungalow", "built-form": "Detached"},
        "House": {"property-type": "House", "built-form": None},
        "Semi Detached Bung": {"property-type": "Bungalow", "built-form": "Semi-Detached"},
        "Bedspace": {"property-type": None, "built-form": None},
        "Office Buildings": {"property-type": None, "built-form": None},
        "End Terrace Bungalow": {"property-type": "Bungalow", "built-form": "End-Terrace"},
        "Mid Terrace Bungalow": {"property-type": "Bungalow", "built-form": "Mid-Terrace"},
        "Bedsit": {"property-type": "Flat", "built-form": None},
        "Mid Terrace Housekeeping": {"property-type": "House", "built-form": "Mid-Terrace"},
        "Mid Terrace Housekeeping ": {"property-type": "House", "built-form": "Mid-Terrace"},
        "End Terrace Housex": {"property-type": "House", "built-form": "End-Terrace"},
        "Guest Room": {"property-type": None, "built-form": None}
    }

    # Iterating a DataFrame directly yields its column labels, not its rows, so rows must
    # be requested explicitly.
    rows = data.iterrows() if isinstance(data, pd.DataFrame) else data
    for _, property_meta in tqdm(rows, total=len(data)):
        searcher = SearchEpc(
            address1=property_meta["HouseNo"],
            postcode=property_meta["postcode"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=None,
            full_address=property_meta["address"]
        )
        asset_type = property_type_lookup[property_meta["T1_AssetType"]]
        searcher.ordnance_survey_client.property_type = asset_type["property-type"]
        searcher.ordnance_survey_client.built_form = asset_type["built-form"]
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None:
            nodata.append(property_meta)
            continue

        # The numeric suffix of the row ID is our proxy for a missing UPRN
        proxy_uprn = int(property_meta["row_id"].split("_")[1])
        if searcher.newest_epc.get("estimated"):
            searcher.newest_epc["uprn"] = proxy_uprn

        newest_epc = searcher.newest_epc
        older_epcs = searcher.older_epcs
        full_sap_epc = searcher.full_sap_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()

        # If the property is a filled cavity wall (and the loft is suitable), estimate the
        # age of the cavity fill so that particularly old fills can be flagged.
        cavity_age = None
        if (
                eligibility.walls["is_cavity_wall"] and
                eligibility.walls["is_filled_cavity"] and
                eligibility.loft["suitability"] and
                eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age"
        ):
            cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)

        # Full checks
        eligibility.check_gbis()
        eligibility.check_eco4()

        if eligibility.eco4_warmfront["eligible"]:
            # Temporary gap filling so that the model row can be built
            if eligibility.epc["uprn"] in ["", None]:
                eligibility.epc["uprn"] = proxy_uprn
            if eligibility.epc["construction-age-band"] in ["", None]:
                eligibility.epc["construction-age-band"] = map_year_to_age_band(property_meta["Build Yr"])
            if eligibility.epc["extension-count"] in ["", None]:
                eligibility.epc["extension-count"] = 0
            if eligibility.epc["built-form"] in ["", None] and not older_epcs:
                eligibility.epc["built-form"] = "Mid-Terrace"

            scoring_dictionary = prepare_model_data_row(
                property_id=property_meta["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at,
                old_data=older_epcs,
                full_sap_epc=full_sap_epc,
                photo_supply_lookup=photo_supply_lookup,
                floor_area_decile_thresholds=floor_area_decile_thresholds,
            )
            scoring_data.extend(scoring_dictionary)

        results.append(
            {
                "row_id": property_meta["row_id"],
                "uprn": eligibility.epc["uprn"],
                "Address": property_meta["T1_Address"],
                "Postcode": property_meta["postcode"],
                "property_type": eligibility.epc["property-type"],
                "gbis_eligible": eligibility.gbis_warmfront,
                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                "eco4_message": eligibility.eco4_warmfront["message"],
                "sap": float(eligibility.epc["current-energy-efficiency"]),
                "gbis_eligible_future": eligibility.gbis["eligible"],
                "gbis_eligible_future_message": eligibility.gbis["message"],
                "eco4_eligible_future": eligibility.eco4["eligible"],
                "eco4_eligible_future_message": eligibility.eco4["message"],
                # Property components
                "roof": eligibility.roof["clean_description"],
                "walls": eligibility.walls["clean_description"],
                "cavity_type": eligibility.cavity["type"],
                "heating": eligibility.epc["mainheat-description"],
                "tenure": eligibility.tenure,
                "date_epc": eligibility.epc["lodgement-date"],
                "cavity_age": cavity_age,
                **eligibility.walls,
                **eligibility.roof,
            }
        )

    scoring_df = pd.DataFrame(scoring_data)
    # Perform the same cleaning as in the model - first clean number of room variables though
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
    )
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
    ).drop(columns=["LOCAL_AUTHORITY"])
    scoring_df = DataProcessor.clean_missings_after_description_process(
        scoring_df,
        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
                "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
    )
    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)

    # NOTE(review): the portfolio id says "ha33" although this script handles HA 25 - confirm.
    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
    all_predictions = model_api.predict_all(
        df=scoring_df,
        bucket="retrofit-data-dev",
        prediction_buckets={
            "sap_change_predictions": "retrofit-sap-predictions-dev",
            "heat_demand_predictions": "retrofit-heat-predictions-dev",
            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
        }
    )

    predictions = all_predictions["sap_change_predictions"].copy()
    results_df = pd.DataFrame(results)
    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
        results_df[["row_id", "sap"]], how="left", on="row_id"
    )
    # Total uplift per property is the sum of the individual predicted uplifts over current SAP
    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
    results_df = results_df.merge(
        predictions[["sap_uplift", "row_id"]],
        how="left",
        on="row_id"
    )
    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]

    # The upgrade requirements are dependent on the current SAP
    eligibility_assessment = [
        {
            "row_id": row["row_id"],
            "eligibility_classification": _classify_eco4_confidence(row["sap"], row["post_install_sap"]),
        }
        for _, row in results_df[results_df["eco4_eligible"] == True].iterrows()
    ]
    # Explicit columns keep the merge valid when no property is ECO4 eligible
    eligibility_assessment = pd.DataFrame(
        eligibility_assessment, columns=["row_id", "eligibility_classification"]
    )
    results_df = results_df.merge(
        eligibility_assessment, how="left", on="row_id"
    )
    return results_df, scoring_data, nodata
def get_epc_data_for_lost_surveys(
        lost_identified_properties, cleaned, cleaning_data, created_at, photo_supply_lookup,
        floor_area_decile_thresholds
):
    """Run the EPC lookup, eligibility checks and scoring for surveys missing from the asset list.

    Args:
        lost_identified_properties: DataFrame of survey rows that could not be matched to an
            asset. A "row_id" column ("lost_surveys_ha25_<n>") is added to it in place. Each
            row is read for "NO", "ADDRESS 1", "ADDRESS 2", "ADDRESS 3", "POSTCODE" and
            "PROPERTY TYPE".
        cleaned: Cleaned EPC lookup passed through to ``Eligibility`` and the scoring helpers.
        cleaning_data: Reference data used by ``DataProcessor`` averages cleaning.
        created_at: Timestamp passed to the scoring row builder and ``ModelApi``.
        photo_supply_lookup: Passed through to ``prepare_model_data_row``.
        floor_area_decile_thresholds: Passed through to ``prepare_model_data_row``.

    Returns:
        tuple: ``(results_df, scoring_data, nodata)`` - one result row per property with an
        EPC, the raw scoring rows, and the survey rows for which no EPC was found.
    """
    lost_identified_properties["row_id"] = [
        "lost_surveys_ha25_" + str(i) for i in range(len(lost_identified_properties))
    ]
    scoring_data = []
    results = []
    nodata = []
    # NOTE(review): every key maps to property-type "House", including the flat-like keys -
    # confirm this is intended.
    property_type_lookup = {
        "MID-TERRACE": {"property-type": "House", "built-form": "Mid-Terrace"},
        "N/A": {"property-type": "House", "built-form": None},
        "END-TERRACE": {"property-type": "House", "built-form": "End-Terrace"},
        "GROUND-FLOOR": {"property-type": "House", "built-form": None},
        "TOP-FLOOR": {"property-type": "House", "built-form": None},
        "SEMI-DETACHED": {"property-type": "House", "built-form": "Semi-Detached"},
        "MID-FLOOR": {"property-type": "House", "built-form": None},
        "TOP-FLOOR FLAT": {"property-type": "House", "built-form": None},
        "DETACHED": {"property-type": "House", "built-form": "Detached"},
        "MID-FLOOR FLAT": {"property-type": "House", "built-form": None},
        "SEMI- DETACHED": {"property-type": "House", "built-form": "Semi-Detached"},
        "NO EPC ON GOV": {"property-type": "House", "built-form": None},
        "Top-floor flat": {"property-type": "House", "built-form": None},
        "GROUND-FLOOR FLAT": {"property-type": "House", "built-form": None},
        "NOT ON GOV SITE": {"property-type": "House", "built-form": None}
    }

    for _, property_meta in tqdm(lost_identified_properties.iterrows(), total=len(lost_identified_properties)):
        # Without a postcode there is nothing to search on (pd.isnull also catches NaN)
        if pd.isnull(property_meta["POSTCODE"]):
            continue

        address_parts = [
            property_meta["NO"], property_meta["ADDRESS 1"], property_meta["ADDRESS 2"], property_meta["ADDRESS 3"]
        ]
        full_address = ", ".join(str(part) for part in address_parts if not pd.isnull(part))

        searcher = SearchEpc(
            address1=str(property_meta["NO"]),
            postcode=property_meta["POSTCODE"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=None,
            full_address=full_address
        )
        property_type_key = property_meta["PROPERTY TYPE"]
        if property_type_key is not None:
            property_type = property_type_lookup[property_type_key.strip()]
            searcher.ordnance_survey_client.property_type = property_type["property-type"]
            searcher.ordnance_survey_client.built_form = property_type["built-form"]
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None:
            nodata.append(property_meta)
            continue

        # The trailing number of the row ID ("lost_surveys_ha25_<n>") is our proxy for UPRN
        proxy_uprn = int(property_meta["row_id"].split("_")[-1])
        if searcher.newest_epc.get("estimated"):
            searcher.newest_epc["uprn"] = proxy_uprn

        newest_epc = searcher.newest_epc
        older_epcs = searcher.older_epcs
        full_sap_epc = searcher.full_sap_epc

        # We also want to get the penultimate epc
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        if not penultimate_epc:
            penultimate_epc = newest_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()
        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront):
            # Not eligible on the newest EPC: retry on the penultimate one
            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()
            # If this is the case, we need to update the older epcs
            # We don't update just to make data cleaning easier
            if penultimate_epc.get("estimated") is None:
                older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]

        # Full checks
        eligibility.check_gbis()
        eligibility.check_eco4()

        # Only score properties that are eligible and have a construction age band
        if eligibility.eco4_warmfront["eligible"] and (
                eligibility.epc["construction-age-band"] not in ["", None]
        ):
            if eligibility.epc["uprn"] in ["", None]:
                # split("_")[1] would be "surveys" for these row IDs, so take the last token
                eligibility.epc["uprn"] = proxy_uprn
            scoring_dictionary = prepare_model_data_row(
                property_id=property_meta["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at,
                old_data=older_epcs,
                full_sap_epc=full_sap_epc,
                photo_supply_lookup=photo_supply_lookup,
                floor_area_decile_thresholds=floor_area_decile_thresholds,
            )
            scoring_data.extend(scoring_dictionary)

        results.append(
            {
                "row_id": property_meta["row_id"],
                "uprn": eligibility.epc["uprn"],
                "Address": property_meta["ADDRESS 1"],
                "Postcode": property_meta["POSTCODE"],
                "property_type": eligibility.epc["property-type"],
                "gbis_eligible": eligibility.gbis_warmfront,
                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                "eco4_message": eligibility.eco4_warmfront["message"],
                "sap": float(eligibility.epc["current-energy-efficiency"]),
                "gbis_eligible_future": eligibility.gbis["eligible"],
                "gbis_eligible_future_message": eligibility.gbis["message"],
                "eco4_eligible_future": eligibility.eco4["eligible"],
                "eco4_eligible_future_message": eligibility.eco4["message"],
                # Property components
                "roof": eligibility.roof["clean_description"],
                "walls": eligibility.walls["clean_description"],
                "cavity_type": eligibility.cavity["type"],
                "heating": eligibility.epc["mainheat-description"],
                "tenure": eligibility.tenure,
                "date_epc": eligibility.epc["lodgement-date"],
                **eligibility.walls,
                **eligibility.roof,
            }
        )

    scoring_df = pd.DataFrame(scoring_data)
    # Perform the same cleaning as in the model - first clean number of room variables though
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
    )
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
    ).drop(columns=["LOCAL_AUTHORITY"])
    scoring_df = DataProcessor.clean_missings_after_description_process(
        scoring_df,
        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
                "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
    )
    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)

    # NOTE(review): the portfolio id says "ha33" although this script handles HA 25 - confirm.
    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
    all_predictions = model_api.predict_all(
        df=scoring_df,
        bucket="retrofit-data-dev",
        prediction_buckets={
            "sap_change_predictions": "retrofit-sap-predictions-dev",
            "heat_demand_predictions": "retrofit-heat-predictions-dev",
            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
        }
    )

    predictions = all_predictions["sap_change_predictions"].copy()
    results_df = pd.DataFrame(results)
    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
        results_df[["row_id", "sap"]], how="left", on="row_id"
    )
    # Total uplift per property is the sum of the individual predicted uplifts over current SAP
    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()
    results_df = results_df.merge(
        predictions[["sap_uplift", "row_id"]],
        how="left",
        on="row_id"
    )
    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]

    eligibility_assessment = []
    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
        # The upgrade requirements are dependent on the current SAP: a current SAP of 38 or
        # below is measured against a target of 55, everything else against 69.
        # NOTE(review): presumably 55 / 69 are the lower bounds of EPC bands D / C - confirm.
        target = 55 if row["sap"] <= 38 else 69
        if row["post_install_sap"] >= target + 2:
            eligibility_classification = "highest confidence"
        elif row["post_install_sap"] >= target:
            eligibility_classification = "high confidence"
        elif row["post_install_sap"] >= target - 2:
            eligibility_classification = "medium confidence"
        else:
            eligibility_classification = "unlikely"
        eligibility_assessment.append(
            {
                "row_id": row["row_id"],
                "eligibility_classification": eligibility_classification
            }
        )
    # Explicit columns keep the merge valid when no property is ECO4 eligible
    eligibility_assessment = pd.DataFrame(
        eligibility_assessment, columns=["row_id", "eligibility_classification"]
    )
    results_df = results_df.merge(
        eligibility_assessment, how="left", on="row_id"
    )
    return results_df, scoring_data, nodata
def analyse_results(results_df, data, eco4_prospects_survey_list):
    """Ad hoc breakdown of the scored asset list; builds the subsets locally and returns nothing.

    Joins the asset rows to their results and survey details, derives a numeric roof
    insulation thickness, and slices out the warmfront-identified, ideal ECO4 and GBIS
    subsets for inspection (e.g. in a debugger).
    """
    asset_keys = data[["row_id", "survey_key", "warmfront_identified"]]
    survey_details = eco4_prospects_survey_list[["survey_key", "ADDRESS 1", "NO", "POSTCODE"]]
    analysis_data = asset_keys.merge(results_df, how="left", on="row_id")
    analysis_data = analysis_data.merge(survey_details, how="left", on="survey_key")

    # Normalise missing thickness values to None before converting to a number
    raw_thickness = analysis_data["roof_insulation_thickness"]
    analysis_data["roof_insulation_thickness"] = np.where(pd.isnull(raw_thickness), None, raw_thickness)
    analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply(
        lambda thickness: convert_thickness_to_numeric(thickness, is_flat=False, is_pitched=True)
    )

    # Original author's annotation for this subset: 2204
    is_warmfront = analysis_data["warmfront_identified"] == True
    warmfront_identified = analysis_data[is_warmfront]

    # Because we don't know which property is for which scheme, we just look at what we found.
    # Original author's annotation for this subset: 335
    is_ideal_eco4 = (
        (analysis_data["eco4_eligible"] == True)
        & (analysis_data["roof_insulation_thickness_numeric"] <= 100)
        & (analysis_data["sap"] <= 54)
    )
    ideal_eco4 = analysis_data[is_ideal_eco4]

    # GBIS candidates are the GBIS-eligible rows not already counted as ideal ECO4
    already_ideal = analysis_data["row_id"].isin(ideal_eco4["row_id"].values)
    gbis = analysis_data[(analysis_data["gbis_eligible"] == True) & ~already_ideal]

    ideal_eco4 = ideal_eco4[ideal_eco4["sap"] <= 54]
def analyse_lost_surveys(results_df):
    """Ad hoc breakdown of the lost-survey results; returns nothing.

    Overwrites "roof_insulation_thickness" and adds "roof_insulation_thickness_numeric" on
    the passed frame, then slices out the ideal ECO4 and GBIS subsets for inspection.
    """
    # Normalise missing thickness values to None before converting to a number
    raw_thickness = results_df["roof_insulation_thickness"]
    results_df["roof_insulation_thickness"] = np.where(pd.isnull(raw_thickness), None, raw_thickness)
    results_df["roof_insulation_thickness_numeric"] = results_df["roof_insulation_thickness"].apply(
        lambda thickness: convert_thickness_to_numeric(thickness, is_flat=False, is_pitched=True)
    )

    # Original author's annotation for this subset: 25
    is_ideal_eco4 = (
        (results_df["eco4_eligible"] == True)
        & (results_df["roof_insulation_thickness_numeric"] <= 100)
        & (results_df["sap"] <= 54)
    )
    ideal_eco4 = results_df[is_ideal_eco4]

    # GBIS-eligible rows not already counted as ideal ECO4.
    # Original author's annotation for this subset: 82
    already_ideal = results_df["row_id"].isin(ideal_eco4["row_id"].values)
    gbis = results_df[(results_df["gbis_eligible"] == True) & ~already_ideal]
def app():
    """Entry point: load the HA 25 data, fetch the reference datasets and score the asset list.

    The freshly computed outputs are currently replaced by a previously pickled run
    ("ha25_10_jan.pickle") read from the working directory.
    """
    data, eco4_prospects_survey_list, lost_identified_properties = load_data()
    data["row_id"] = [f"ha25_{i}" for i in range(len(data))]

    # Cleaned EPC lookup is stored msgpack-encoded in S3
    cleaned = msgpack.unpackb(
        read_from_s3(
            s3_file_name="cleaned_epc_data/cleaned.bson",
            bucket_name="retrofit-data-dev"
        ),
        raw=False,
    )
    cleaning_data = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
    )
    created_at = datetime.now().isoformat()
    photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")

    results_df, scoring_data, nodata = get_epc_data(
        data, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds
    )

    # The outputs of a run were previously saved by pickling a dict with the keys
    # "results_df", "scoring_data" and "nodata" to "ha25_10_jan.pickle"
    # (older data was stored in ha25.pickle).

    # Load in pickle.
    # NOTE(security): pickle.load can execute arbitrary code; only load files produced by
    # this project.
    import pickle
    with open("ha25_10_jan.pickle", "rb") as f:
        saved = pickle.load(f)
    results_df, scoring_data, nodata = saved["results_df"], saved["scoring_data"], saved["nodata"]