mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
completed ha15
This commit is contained in:
parent
739afbd79b
commit
736e02cb4a
2 changed files with 405 additions and 18 deletions
|
|
@ -46,10 +46,17 @@ class Eligibility:
|
|||
return remapped[0]
|
||||
|
||||
if key == "walls-description":
|
||||
return WallAttributes(self.epc["walls-description"]).process()
|
||||
cleaner_cls = WallAttributes(self.epc["roof-description"])
|
||||
|
||||
if key == "roof-description":
|
||||
return RoofAttributes(self.epc["roof-description"]).process()
|
||||
elif key == "roof-description":
|
||||
cleaner_cls = RoofAttributes(self.epc["roof-description"])
|
||||
|
||||
else:
|
||||
raise ValueError("Invalid key")
|
||||
output = cleaner_cls.process()
|
||||
output["clean_description"] = cleaner_cls.description.replace("(assumed)", "").rstrip().capitalize()
|
||||
|
||||
return output
|
||||
|
||||
def loft_insulation(self, loft_thickness_threshold: int = None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ This process has been created to compare the model based eligibility process aga
|
|||
used by the Warmfront team, to identify which properties are eligible for ECO4 and GBIS funding. This
|
||||
work is being done in December 2023, prior to completion of acquisition
|
||||
"""
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
|
@ -18,6 +19,8 @@ from etl.eligibility.Eligibility import Eligibility
|
|||
from etl.epc.DataProcessor import DataProcessor
|
||||
from backend.app.utils import read_parquet_from_s3
|
||||
from backend.app.plan.utils import create_recommendation_scoring_data
|
||||
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||
from backend.ml_models.api import ModelApi
|
||||
|
||||
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
|
||||
|
||||
|
|
@ -462,6 +465,9 @@ def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
|
|||
"warmfront_identified": house["identified"],
|
||||
"gbis_eligible": None,
|
||||
"eco4_eligible": None,
|
||||
"sap": None,
|
||||
"roof": None,
|
||||
"walls": None,
|
||||
"date_epc": None,
|
||||
"message": "No EPC found",
|
||||
}
|
||||
|
|
@ -481,13 +487,18 @@ def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
|
|||
eligibility.check_eco4()
|
||||
|
||||
# If there is no eligibility, we need to check the penultimate epc
|
||||
if (not eligibility.eco4["eligible"]) and (not eligibility.gbis):
|
||||
# However, we only check the penultimate epc if the property is identified
|
||||
# This is because if the property was identified, it's possible that the newest EPC is a post-retrofit
|
||||
# EPC, which would mean that the penultimate EPC is the pre-retrofit EPC
|
||||
# However, if the property has NOT been identified, we don't want to check the penultimate EPC since
|
||||
# The newest EPC will reflect the current state of the home and therefore we determine if there is a new
|
||||
# opportunity for retrofit
|
||||
if (not eligibility.eco4["eligible"]) and (not eligibility.gbis) and (house["identified"]):
|
||||
eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
|
||||
eligibility.check_gbis()
|
||||
eligibility.check_eco4()
|
||||
|
||||
if eligibility.eco4["eligible"]:
|
||||
# TODO: Check me
|
||||
scoring_dictionary = prepare_model_data_row(
|
||||
property_id=house["row_id"],
|
||||
modelling_epc=eligibility.epc,
|
||||
|
|
@ -502,6 +513,9 @@ def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
|
|||
"warmfront_identified": house["identified"],
|
||||
"gbis_eligible": eligibility.gbis,
|
||||
"eco4_eligible": eligibility.eco4["eligible"],
|
||||
"sap": float(eligibility.epc["current-energy-efficiency"]),
|
||||
"roof": eligibility.roof["clean_description"],
|
||||
"walls": eligibility.walls["clean_description"],
|
||||
"date_epc": eligibility.epc["lodgement-date"],
|
||||
"message": "eco4 conditional on post sap",
|
||||
}
|
||||
|
|
@ -519,22 +533,235 @@ def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
|
|||
"warmfront_identified": house["identified"],
|
||||
"gbis_eligible": eligibility.gbis,
|
||||
"eco4_eligible": eligibility.eco4["eligible"],
|
||||
"sap": float(eligibility.epc["current-energy-efficiency"]),
|
||||
"roof": eligibility.roof["clean_description"],
|
||||
"walls": eligibility.walls["clean_description"],
|
||||
"date_epc": eligibility.epc["lodgement-date"],
|
||||
"message": None
|
||||
}
|
||||
)
|
||||
|
||||
logger.info("no_house_numbers")
|
||||
return results, scoring_data, no_house_numbers
|
||||
|
||||
return results, scoring_data
|
||||
|
||||
def _ha15_classify_post_install_sap(post_install_sap):
    """Translate a predicted post-install SAP score into a confidence label.

    The bands are tiered because the model prediction can be wrong:
    >= 71 -> "highest confidence", >= 69 -> "high confidence",
    >= 67 -> "medium confidence", anything else -> "unlikely".
    """
    if post_install_sap >= 71:
        return "highest confidence"
    if post_install_sap >= 69:
        return "high confidence"
    if post_install_sap >= 67:
        return "medium confidence"
    # Every comparison above is False for NaN, so a row without a prediction
    # also ends up here.
    return "unlikely"


def _ha15_result_row(house, eligibility, message):
    """Build the per-property result record for a property that has an EPC."""
    return {
        "row_id": house["row_id"],
        "warmfront_identified": house["identified"],
        "gbis_eligible": eligibility.gbis,
        "eco4_eligible": eligibility.eco4["eligible"],
        "sap": float(eligibility.epc["current-energy-efficiency"]),
        "roof": eligibility.roof["clean_description"],
        "walls": eligibility.walls["clean_description"],
        "date_epc": eligibility.epc["lodgement-date"],
        "message": message,
    }


def get_ha_15data(ha_data, cleaned, cleaning_data, created_at):
    """Run the GBIS / ECO4 eligibility process over the HA15 property list.

    For every row of ``ha_data`` an EPC is searched for by address and
    postcode, eligibility is checked on the newest EPC (falling back to the
    penultimate EPC for Warmfront-identified properties that are otherwise not
    eligible), and ECO4-eligible properties are scored with the model so that
    a post-install SAP prediction and a confidence label can be attached.

    :param ha_data: DataFrame of HA15 properties. The code reads the columns
        "row_id", "identified", "Address Line 1", "Postcode" and
        "Property Type".
    :param cleaned: passed through to ``Eligibility`` and
        ``prepare_model_data_row``.
    :param cleaning_data: passed through to ``prepare_model_data_row`` and the
        ``DataProcessor`` cleaning steps.
    :param created_at: passed to ``prepare_model_data_row`` and used as the
        ``ModelApi`` timestamp.
    :return: tuple ``(results_df, scoring_df, no_house_numbers)``.
        ``results_df`` has one record per processed property plus the
        "post_install_sap" and "eligibility_classification" columns.
    """
    # Column configuration for the HA15 extract. HA15 has no separate house
    # number / house name columns, so those keys are disabled.
    house_number_key = None
    address_key = "Address Line 1"
    postcode_key = "Postcode"
    house_name = None
    house_type_key = "Property Type"

    house_type_lookup = {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        "House": "House",
        "Flat over garage": "Flat",
        "Maisonette": "Maisonette",
    }

    scoring_data = []
    results = []
    no_house_numbers = []

    for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):

        # If we don't have a house number, we'll continue since we won't realistically be able to find
        # an address
        if house_number_key is not None and pd.isnull(house[house_number_key]):
            no_house_numbers.append(house["row_id"])
            continue

        if house_name is None:
            address1 = house[address_key]
        elif not pd.isnull(house[house_name]):
            address1 = " ".join([house[house_name], house[house_number_key], house[address_key]])
        else:
            address1 = " ".join([house[house_number_key], house[address_key]])

        searcher = SearchEpc(
            address1=address1,
            postcode=house[postcode_key]
        )

        response = searcher.search()
        if response["status"] == 204:
            # TODO: if the property is identified we should investigate why no EPC was found
            results.append(
                {
                    "row_id": house["row_id"],
                    "warmfront_identified": house["identified"],
                    "gbis_eligible": None,
                    "eco4_eligible": None,
                    "sap": None,
                    "roof": None,
                    "walls": None,
                    "date_epc": None,
                    "message": "No EPC found",
                }
            )
            continue

        newest_epc, older_epcs, _ = searcher.retrieve(
            property_type=house_type_lookup.get(house[house_type_key], None)
        )
        # We also want to get the penultimate epc; fall back to the newest one when there is none
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        if not penultimate_epc:
            penultimate_epc = newest_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis()
        eligibility.check_eco4()

        # If there is no eligibility, we need to check the penultimate epc
        # However, we only check the penultimate epc if the property is identified
        # This is because if the property was identified, it's possible that the newest EPC is a post-retrofit
        # EPC, which would mean that the penultimate EPC is the pre-retrofit EPC
        # However, if the property has NOT been identified, we don't want to check the penultimate EPC since
        # the newest EPC will reflect the current state of the home and therefore we determine if there is a new
        # opportunity for retrofit
        if (not eligibility.eco4["eligible"]) and (not eligibility.gbis) and (house["identified"]):
            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
            eligibility.check_gbis()
            eligibility.check_eco4()

        if eligibility.eco4["eligible"]:
            # ECO4 eligibility is conditional on the post-install SAP, so the property is queued for scoring
            scoring_data.append(
                prepare_model_data_row(
                    property_id=house["row_id"],
                    modelling_epc=eligibility.epc,
                    cleaned=cleaned,
                    cleaning_data=cleaning_data,
                    created_at=created_at
                )
            )
            results.append(_ha15_result_row(house, eligibility, "eco4 conditional on post sap"))
            continue

        # If nothing is eligible or gbis is eligible, then we make a record of this
        results.append(_ha15_result_row(house, eligibility, None))

    results_df = pd.DataFrame(results)
    scoring_df = pd.DataFrame(scoring_data)

    if not scoring_data:
        # Nothing was ECO4 eligible, so there is nothing to score. Previously this case fell through to a
        # merge against a column-less DataFrame, which raises KeyError on "row_id".
        results_df["post_install_sap"] = float("nan")
        results_df["eligibility_classification"] = None
        return results_df, scoring_df, no_house_numbers

    # Implement the same process that is being used in the recommendation engine to clean scoring_df

    # Perform the same cleaning as in the model - first clean number of room variables though
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
    )

    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
    ).drop(columns=["LOCAL_AUTHORITY"])

    scoring_df = DataProcessor.clean_missings_after_description_process(
        scoring_df,
        ignore_cols=[
            c for c in scoring_df.columns
            if ("thermal_transmittance" in c) or ("insulation_thickness" in c) or ("ENERGY_EFF" in c)
        ]
    )

    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)

    # NOTE(review): the portfolio id still says "ha32" although this is the HA15 run - confirm this is intended
    model_api = ModelApi(portfolio_id="ha32-eligibility", timestamp=created_at)
    all_predictions = model_api.predict_all(
        df=scoring_df,
        bucket="retrofit-data-dev",
        prediction_buckets={
            "sap_change_predictions": "retrofit-sap-predictions-dev",
            "heat_demand_predictions": "retrofit-heat-predictions-dev",
            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
        }
    )

    # Merge the SAP predictions onto the results
    predictions = all_predictions["sap_change_predictions"]

    results_df = results_df.merge(
        predictions[["predictions", "property_id"]].rename(
            columns={"predictions": "post_install_sap", "property_id": "row_id"}
        ),
        how="left",
        on="row_id"
    )

    # Our methodology for identifying properties is to use the post-install SAP score
    # We produce the following classifications, which accommodate the fact that the model can be wrong
    # 1) If the post-install SAP score is at least 71, the property is eligible and we have the highest confidence
    # 2) If the post-install SAP score is at least 69, the property is eligible with high confidence
    # 3) If the post-install SAP score is at least 67, the property is eligible, but we are not confident
    # 4) If the post-install SAP score is below 67, the property is unlikely to be eligible

    # eco4_eligible holds None for properties without an EPC, hence the explicit comparison with True
    eco4_rows = results_df[results_df["eco4_eligible"] == True]  # noqa: E712
    eligibility_assessment = pd.DataFrame(
        [
            {
                "row_id": row["row_id"],
                "eligibility_classification": _ha15_classify_post_install_sap(row["post_install_sap"]),
            }
            for _, row in eco4_rows.iterrows()
        ],
        columns=["row_id", "eligibility_classification"],
    )

    results_df = results_df.merge(
        eligibility_assessment, how="left", on="row_id"
    )

    return results_df, scoring_df, no_house_numbers
|
||||
|
||||
|
||||
def analyse_ha_32_results(results, ha32, no_house_numbers):
|
||||
results_df = pd.DataFrame(results)
|
||||
import pickle
|
||||
# with open("ha_32_results.pickle", "wb") as f:
|
||||
# pickle.dump(results_df, f)
|
||||
|
||||
"""
|
||||
We want to know:
|
||||
1) What proportion of identified properties we get correct
|
||||
|
|
@ -544,6 +771,8 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):
|
|||
For HA32, most of these (if not all) properties were identified under gbis
|
||||
"""
|
||||
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
# What proportion of the Warmfront-identified properties do we also flag as GBIS eligible?
|
||||
warmfront_identified = results_df[
|
||||
results_df["warmfront_identified"]
|
||||
|
|
@ -552,9 +781,23 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):
|
|||
success_rate = warmfront_identified["gbis_eligible"].sum() / warmfront_identified.shape[0]
|
||||
# For HA32, this is 89%
|
||||
|
||||
# missed = results_df[
|
||||
# results_df["warmfront_identified"] & (warmfront_identified["gbis_eligible"] != True)
|
||||
# ]
|
||||
missed = results_df[
|
||||
results_df["warmfront_identified"] & (warmfront_identified["gbis_eligible"] != True)
|
||||
]
|
||||
|
||||
sap_too_high = missed[
|
||||
missed["sap"] >= 69
|
||||
]
|
||||
|
||||
sap_low_enough = missed[
|
||||
missed["sap"] < 69
|
||||
]
|
||||
|
||||
investigate_1 = ha32[ha32["row_id"].isin(sap_too_high["row_id"])][
|
||||
["row_id", "Postcode", "Address", "Dwelling num", "Street"]]
|
||||
|
||||
investigate_2 = ha32[ha32["row_id"].isin(sap_low_enough["row_id"])][
|
||||
["row_id", "Postcode", "Address", "Dwelling num", "Street"]]
|
||||
|
||||
# to_check = missed[pd.isnull(missed["message"])]
|
||||
|
||||
|
|
@ -605,7 +848,109 @@ def analyse_ha_32_results(results, ha32, no_house_numbers):
|
|||
if no_house_numbers_ha32:
|
||||
logger.error("We have some identified properties that have no house numbers - investigate me")
|
||||
|
||||
return success_rate, new_possibilities
|
||||
new = {
|
||||
"n_new_possibilities": new_possibilities.shape[0],
|
||||
"new_possibilities_confidence": new_possibilities["high_confidence"].value_counts()
|
||||
}
|
||||
|
||||
return success_rate, new
|
||||
|
||||
|
||||
def analyse_ha_15_results(results_df, ha15, no_house_numbers):
    """Summarise how the model-based eligibility results for HA15 compare with Warmfront.

    We want to know:
    1) What proportion of identified properties we get correct
    2) If we miss identified properties, why
    3) Which properties do we identify that were not identified by warmfront. What is our confidence on these?

    :param results_df: DataFrame with the columns "row_id", "warmfront_identified", "gbis_eligible",
        "eco4_eligible", "sap", "date_epc", "message" and "eligibility_classification".
    :param ha15: the HA15 property data; not used by the calculation, kept so the signature is unchanged.
    :param no_house_numbers: not used by the calculation, kept so the signature is unchanged.
    :return: tuple ``(success_rate, new, identified_results, missed_results)`` where ``new`` describes the
        properties we flag that Warmfront did not, ``identified_results`` the Warmfront-identified properties we
        also flag, and ``missed_results`` the Warmfront-identified properties we do not flag.
    """
    # All masks are built on results_df itself so they share one index. The eligibility columns hold None for
    # properties without an EPC, hence the explicit comparison with True.
    identified = results_df["warmfront_identified"]
    gbis_ok = results_df["gbis_eligible"] == True  # noqa: E712
    eco4_ok = results_df["eco4_eligible"] == True  # noqa: E712
    any_ok = gbis_ok | eco4_ok

    # What proportion of the Warmfront-identified properties do we also identify?
    n_warmfront = int(identified.sum())
    n_identified = int((identified & any_ok).sum())
    # Guard the case where no property was identified by Warmfront at all
    success_rate = n_identified / n_warmfront if n_warmfront else float("nan")
    # For HA15 this is 50.3%

    # Of the identified properties we flag under ECO4, what is the mix of confidence
    eco_identified_confidence = results_df[identified & eco4_ok]["eligibility_classification"].value_counts()

    # Warmfront-identified properties that we flag under neither scheme
    missed = results_df[identified & ~any_ok]

    missed_no_data = missed[missed["message"] == "No EPC found"].shape[0]

    # Rows without a SAP score (NaN) fall into neither of these two groups
    sap_too_high = missed[missed["sap"] >= 69]
    sap_low_enough = missed[missed["sap"] < 69]

    # We now look for properties that we identified, that were not identified by Warmfront
    new_possibilities = results_df[(~identified) & any_ok].copy()

    # We deem that any EPC that is produced in the last 3 years gives us good confidence for GBIS
    cutoff_date = datetime.now() - timedelta(days=3 * 365)
    new_possibilities["high_confidence"] = pd.to_datetime(new_possibilities["date_epc"]) >= cutoff_date

    new_eco4 = new_possibilities["eco4_eligible"] == True  # noqa: E712
    new_gbis = new_possibilities["gbis_eligible"] == True  # noqa: E712

    eco_new_possibilities = new_eco4.sum()
    eco_new_possibilities_confidence = new_possibilities[new_eco4]["eligibility_classification"].value_counts()

    gbis_new_possibilites = new_gbis.sum()
    gbis_new_possibilites_confidence = new_possibilities[new_gbis]["high_confidence"].value_counts()

    new = {
        "new_possibilities": new_possibilities,
        "eco_new_possibilities": eco_new_possibilities,
        "eco_new_possibilities_confidence": eco_new_possibilities_confidence,
        "gbis_new_possibilites": gbis_new_possibilites,
        "gbis_new_possibilites_confidence": gbis_new_possibilites_confidence
    }

    identified_results = {
        "n_identified": n_identified,
        "success_rate": success_rate,
        "eco_identified_confidence": eco_identified_confidence
    }

    missed_results = {
        "n_missed": missed.shape[0],
        "n_sap_too_high": sap_too_high.shape[0],
        "n_sap_low_enough": sap_low_enough.shape[0],
        "missed_no_data": missed_no_data
    }

    return success_rate, new, identified_results, missed_results
|
||||
|
||||
|
||||
def app():
|
||||
|
|
@ -631,6 +976,41 @@ def app():
|
|||
|
||||
# We want to retrieve EPCs for every single property
|
||||
# NOTE: HA32 is MOSTLY cavity via GBIS
|
||||
ha_data = ha32
|
||||
|
||||
ha_32_results = get_ha_32data(ha_data, cleaned, cleaning_data, created_at)
|
||||
ha32_results, ha32_scoring_data, ha32_no_house_numbers = get_ha_32data(
|
||||
ha_data=ha32,
|
||||
cleaned=cleaned,
|
||||
cleaning_data=cleaning_data,
|
||||
created_at=created_at
|
||||
)
|
||||
|
||||
# with open("ha32.pickle", "wb") as f:
|
||||
# pickle.dump(
|
||||
# {
|
||||
# "ha32_results": ha32_results,
|
||||
# "ha32_scoring_data": ha32_scoring_data,
|
||||
# "ha32_no_house_numbers": ha32_no_house_numbers
|
||||
# },
|
||||
# f
|
||||
# )
|
||||
|
||||
ha32_success_rate, ha32_new_possibilities = analyse_ha_32_results(
|
||||
results=ha32_results, ha32=ha32, no_house_numbers=ha32_no_house_numbers
|
||||
)
|
||||
|
||||
# HA 15
|
||||
ha15_results_df, ha15_scoring_df, ha15_no_house_numbers = get_ha_15data(ha15, cleaned, cleaning_data, created_at)
|
||||
|
||||
# with open("ha15.pickle", "wb") as f:
|
||||
# pickle.dump(
|
||||
# {
|
||||
# "ha15_results_df": ha15_results_df,
|
||||
# "ha15_scoring_df": ha15_scoring_df,
|
||||
# "ha15_no_house_numbers": ha15_no_house_numbers
|
||||
# },
|
||||
# f
|
||||
# )
|
||||
|
||||
ha15_success_rate, ha15_new, ha15_identified_results, ha15_missed_results = analyse_ha_15_results(
|
||||
ha15_results_df, ha15, ha15_no_house_numbers
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue