mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
1146 lines
46 KiB
Python
1146 lines
46 KiB
Python
"""
|
|
This process has been created to compare the model based eligibility process against the in-person process
|
|
used by the Warmfront team, to identify which properties are eligible for ECO4 and GBIS funding. This
|
|
work is being done in December 2023, prior to completion of acquisition
|
|
"""
|
|
import pickle
|
|
from etl.epc.Record import EPCRecord
|
|
from pathlib import Path
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
import numpy as np
|
|
import msgpack
|
|
from datetime import datetime, timedelta
|
|
from utils.logger import setup_logger
|
|
from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
|
|
from dotenv import load_dotenv
|
|
from backend.SearchEpc import SearchEpc
|
|
from backend.Property import Property
|
|
from etl.eligibility.Eligibility import Eligibility
|
|
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
|
from backend.ml_models.api import ModelApi
|
|
|
|
# Path to the .env file, resolved relative to the directory containing this module
ENV_FILE = Path(__file__).parent.joinpath("etl", "eligibility", "ha_15_32", ".env")

# Module-level logger; used by the merge functions below to report duplicated rows
logger = setup_logger()

# Read the variables defined in ENV_FILE (python-dotenv)
load_dotenv(ENV_FILE)
|
|
|
|
|
|
def load_data():
    """
    Read the HA32 and HA15 asset lists and identified-address lists from CSV.

    The paths are relative, so they are resolved against the current working directory.

    :return: tuple of four DataFrames, in the order
        (ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses)
    """

    # Order matters: it is the order of the returned tuple
    csv_paths = (
        "etl/eligibility/ha_15_32/HA32 - ASSET LIST.csv",
        "etl/eligibility/ha_15_32/HA15 - ASSET LIST.csv",
        "etl/eligibility/ha_15_32/HA 32 Identified addresses.csv",
        "etl/eligibility/ha_15_32/HA 15 Identified addresses.csv",
    )

    frames = [pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths]

    return tuple(frames)
|
|
|
|
|
|
def marge_ha_32(asset_list, identified_addresses):
    """
    Left-join the HA32 identified addresses onto the HA32 asset list.

    Two match keys are used: number + street + postcode ("merge_key") and number + street only
    ("merge_key2"). A row of the asset list is flagged as identified when either key finds a match.
    Both input frames are modified in place (address/postcode corrections and the key columns).

    :param asset_list: DataFrame with the columns "Dwelling num", "Street" and "Postcode"
    :param identified_addresses: DataFrame with the columns "No.", "Address" and "Postcode"
    :return: (merged_data, dropped_identified_merge_keys) - the asset list with the identified address
        columns and a boolean "identified" column, plus the merge keys of the duplicated identified
        addresses that were dropped
    :raises ValueError: if a duplicated asset list key is also an identified address, if a merge changes
        the number of rows, or if the number of unmatched identified addresses is not exactly 36
    """

    def squash(column):
        # Lower-case, trim and remove every space so that formatting differences don't block a match
        return column.astype(str).str.lower().str.strip().str.replace(" ", "")

    dropped_identified_merge_keys = []

    # Reference row count; a left merge must never change it
    starting_rows = len(asset_list)

    # Streets that are written differently in the identified addresses than in the asset list
    street_corrections = (
        ("Coxwold", "Coxwold Grove"),
        ("Barringhton Avenue", "Barrington Avenue"),
        ("Rustenburg", "Rustenburg Street"),
        ("MALIN LODGE, RONALDSWAY CLOSE", "Malin Lodge"),
        ("Feroes Close", "Faroes Close"),
    )
    for listed_as, corrected in street_corrections:
        identified_addresses["Address"] = np.where(
            identified_addresses["Address"] == listed_as,
            corrected,
            identified_addresses["Address"],
        )

    # 7 Norton Grove carries the wrong postcode in the asset list
    is_seven_norton_grove = (
        (asset_list["Street"] == "Norton Grove")
        & (asset_list["Postcode"] == "HU4 6HQ")
        & (asset_list["Dwelling num"] == "7")
    )
    asset_list["Postcode"] = np.where(is_seven_norton_grove, "HU4 6HG", asset_list["Postcode"])

    # Build both keys on each side: the short key is number + street, the full key adds the postcode
    asset_number_street = squash(asset_list["Dwelling num"]) + squash(asset_list["Street"])
    asset_list["merge_key"] = asset_number_street + squash(asset_list["Postcode"])
    asset_list["merge_key2"] = asset_number_street

    identified_number_street = squash(identified_addresses["No."]) + squash(identified_addresses["Address"])
    identified_addresses["merge_key"] = identified_number_street + squash(identified_addresses["Postcode"])
    identified_addresses["merge_key2"] = identified_number_street

    # Duplicated identified addresses are dropped, but we keep a record of which keys were affected
    identified_dupes = identified_addresses["merge_key"].duplicated()
    n_identified_dupes = identified_dupes.sum()
    if n_identified_dupes:
        logger.warning("We have %s duplicated identified addresses that will be dropped", n_identified_dupes)
        dropped_identified_merge_keys.extend(identified_addresses.loc[identified_dupes, "merge_key"].tolist())

    identified_addresses = identified_addresses.drop_duplicates("merge_key")

    # Duplicated asset list rows are kept, provided none of them is an identified address
    asset_list_dupes = asset_list["merge_key"].duplicated()
    if asset_list_dupes.sum():
        logger.warning(
            "We have some duplicated asset list rows - they won't be dropped but we make sure they aren't in the "
            "identified addresses"
        )
        dupe_keys = asset_list.loc[asset_list["merge_key"].duplicated(), "merge_key"].tolist()

        check = identified_addresses[identified_addresses.merge_key.isin(dupe_keys)]
        if not check.empty:
            raise ValueError("We have a problem here, investigate me")

    # First pass: match on the full key
    merged_data = asset_list.merge(
        identified_addresses.drop(columns="merge_key2"),
        how="left",
        left_on="merge_key",
        right_on="merge_key",
        suffixes=("", "_identified_addresses"),
    )

    if merged_data.shape[0] != starting_rows:
        raise ValueError("Row numbers have changed")

    # Second pass: match on the short key (no postcode)
    merged_data = pd.merge(
        merged_data,
        identified_addresses.drop(columns="merge_key"),
        how="left",
        left_on="merge_key2",
        right_on="merge_key2",
        suffixes=("", "_identified_addresses2"),
    )

    if merged_data.shape[0] != starting_rows:
        raise ValueError("Row numbers have changed")

    matched_on_full_key = merged_data["Postcode_identified_addresses"].notnull()
    matched_on_short_key = merged_data["Postcode_identified_addresses2"].notnull()
    merged_data["identified"] = matched_on_full_key | matched_on_short_key

    # HA32 issue: 36 Hessle Road addresses are in the identified addresses but not in the original asset
    # list, so exactly 36 identified addresses are expected to match on neither key
    full_key_known = identified_addresses["merge_key"].isin(merged_data["merge_key"])
    short_key_known = identified_addresses["merge_key2"].isin(merged_data["merge_key2"])
    missed = identified_addresses[~(full_key_known | short_key_known)]

    if len(missed) != 36:
        raise ValueError("We have a problem here, investigate me, missings beyond the Hessle Road addresses")

    return merged_data, dropped_identified_merge_keys
|
|
|
|
|
|
def _ha15_match_key(frame, columns):
    """
    Concatenate the given columns of frame into a normalised match key.

    Each column is cast to str, lower-cased, trimmed and stripped of spaces; commas and full stops are
    then removed from the concatenated key. Every replacement is a literal replacement (regex=False), so
    the full stop can never be interpreted as the regex wildcard, whatever the pandas version.
    Note that missing values are cast to str as well, so they contribute their string form to the key.
    """
    parts = [
        frame[column].astype(str).str.lower().str.strip().str.replace(" ", "", regex=False)
        for column in columns
    ]
    key = parts[0]
    for part in parts[1:]:
        key = key + part
    return key.str.replace(",", "", regex=False).str.replace(".", "", regex=False)


def merge_ha_15(asset_list, identified_addresses):
    """
    Left-join the HA15 identified addresses onto the HA15 asset list.

    The identified addresses hold a single "Address" string, whereas the asset list splits the address
    over several lines. Four candidate keys are therefore built on the asset list (different combinations
    of the address lines plus the postcode) and each one is matched against the single identified key.
    A row is flagged as identified when any of the four keys finds a match.
    Both input frames are modified in place (address/postcode corrections and the key columns).

    :param asset_list: DataFrame with the columns "Address Line 1" to "Address Line 4" and "Postcode"
    :param identified_addresses: DataFrame with the columns "Address" and "Postcode"
    :return: (merged_data, dropped_identified_merge_keys) - the asset list with the identified address
        columns of each pass and a boolean "identified" column, plus the merge keys of the duplicated
        identified addresses that were dropped
    :raises ValueError: if a duplicated asset list key is also an identified address, if a merge changes
        the number of rows, or if any identified address matches none of the four keys
    """

    dropped_identified_merge_keys = []

    # Mary Mac Manus Drive is listed together with the town in the identified addresses
    identified_addresses["Address"] = identified_addresses["Address"].str.replace(
        "Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive", regex=False
    )

    # This address has the wrong postcode in the original asset list
    asset_list["Postcode"] = np.where(
        asset_list["Address Line 1"] == "103 Priory Crescent",
        "HP19 9NY",
        asset_list["Postcode"],
    )

    # Reference row count; a left merge must never change it
    starting_rows = len(asset_list)

    # Candidate keys on the asset list, paired with the suffix given to the identified columns of that pass
    asset_keys = (
        ("merge_key", ["Address Line 1", "Postcode"], "_identified_addresses"),
        (
            "merge_key2",
            ["Address Line 1", "Address Line 2", "Address Line 3", "Postcode"],
            "_identified_addresses2",
        ),
        ("merge_key3", ["Address Line 1", "Address Line 2", "Postcode"], "_identified_addresses3"),
        (
            "merge_key4",
            ["Address Line 1", "Address Line 2", "Address Line 4", "Postcode"],
            "_identified_addresses4",
        ),
    )
    for key_name, columns, _ in asset_keys:
        asset_list[key_name] = _ha15_match_key(asset_list, columns)

    identified_addresses["merge_key"] = _ha15_match_key(identified_addresses, ["Address", "Postcode"])

    # Duplicated identified addresses are dropped, but we keep a record of which keys were affected
    identified_dupes = identified_addresses["merge_key"].duplicated()
    if identified_dupes.sum():
        logger.warning("We have %s duplicated identified addresses that will be dropped", identified_dupes.sum())
        dropped_identified_merge_keys.extend(identified_addresses.loc[identified_dupes, "merge_key"].tolist())

    identified_addresses = identified_addresses.drop_duplicates("merge_key")

    # Duplicated asset list rows are kept, provided none of them is an identified address
    asset_list_dupes = asset_list["merge_key"].duplicated()
    if asset_list_dupes.sum():
        logger.warning(
            "We have some duplicated asset list rows - they won't be dropped but we make sure they aren't in the "
            "identified addresses"
        )
        dupe_keys = asset_list.loc[asset_list_dupes, "merge_key"].tolist()

        check = identified_addresses[identified_addresses.merge_key.isin(dupe_keys)]
        if not check.empty:
            raise ValueError("We have a problem here, investigate me")

    # One left merge per candidate key, always against the single identified key
    merged_data = asset_list
    for key_name, _, suffix in asset_keys:
        merged_data = pd.merge(
            merged_data,
            identified_addresses,
            how="left",
            left_on=key_name,
            right_on="merge_key",
            suffixes=("", suffix),
        )

        if merged_data.shape[0] != starting_rows:
            raise ValueError("Row numbers have changed")

    # The identified postcode of a pass is only populated when that pass found a match
    postcode_match_columns = ["Postcode" + suffix for _, _, suffix in asset_keys]
    merged_data["identified"] = merged_data[postcode_match_columns].notnull().any(axis=1)

    # Unlike HA32, every HA15 identified address is expected to be found in the asset list
    matched = pd.Series(False, index=identified_addresses.index)
    for key_name, _, _ in asset_keys:
        matched = matched | identified_addresses["merge_key"].isin(merged_data[key_name])
    missed = identified_addresses[~matched]

    if missed.shape[0]:
        raise ValueError("We have a problem here, investigate me, should not have any missings for ha15")

    return merged_data, dropped_identified_merge_keys
|
|
|
|
|
|
def prepare_model_data_row(
    property_id, modelling_epc, cleaned, cleaning_data, created_at,
    photo_supply_lookup=None, floor_area_decile_thresholds=None, old_data=None, full_sap_epc=None,
):
    """
    Build the model scoring data for one property, simulating cavity wall and loft insulation.

    According to the original author this mirrors the preparation done by the recommendation engine and
    is expected to change with up-coming refactoring.

    :param property_id: identifier of the property; it is converted to str to build the recommendation ids
    :param modelling_epc: EPC used for modelling; its "postcode" and "address1" entries are read
    :param cleaned: passed to Property.get_components and Property.create_base_difference_epc_record
    :param cleaning_data: passed to EPCRecord
    :param created_at: accepted for interface compatibility; not used by this function
    :param photo_supply_lookup: forwarded unchanged to Property.get_components
    :param floor_area_decile_thresholds: forwarded unchanged to Property.get_components
    :param old_data: optional older record; copied when given, otherwise None is passed on
    :param full_sap_epc: optional full SAP EPC; copied when given, otherwise None is passed on
    :return: a one-element list holding the result of Property.create_recommendation_scoring_data

    NOTE(review): photo_supply_lookup and floor_area_decile_thresholds default to None because the
    callers in this module do not supply them. Confirm that Property.get_components accepts None for
    both, and that EPCRecord accepts None for the "full_sap_epc" and "old_data" records.
    """

    def copy_or_none(record):
        # The optional records default to None, which has no .copy()
        return None if record is None else record.copy()

    epc_records = {
        'original_epc': modelling_epc.copy(),
        'full_sap_epc': copy_or_none(full_sap_epc),
        'old_data': copy_or_none(old_data),
    }

    prepared_epc = EPCRecord(
        epc_records=epc_records,
        run_mode="newdata",
        cleaning_data=cleaning_data
    )

    p = Property(
        id=property_id,
        postcode=modelling_epc["postcode"],
        address=modelling_epc["address1"],
        epc_record=prepared_epc
    )

    p.get_components(
        cleaned, photo_supply_lookup=photo_supply_lookup, floor_area_decile_thresholds=floor_area_decile_thresholds
    )

    p.create_base_difference_epc_record(cleaned_lookup=cleaned)

    # f-strings rather than "-".join so that a non-string property_id (e.g. an integer row id) also works
    cavity_simulation = {
        "recommendation_id": f"{property_id}-cavity",
        "type": "cavity_wall_insulation",
        "new_u_value": 0.35,
        "parts": [{}]
    }

    loft_simulation = {
        "recommendation_id": f"{property_id}-loft",
        "type": "loft_insulation",
        "new_u_value": 0.16,
        "parts": [{"depth": 270}]
    }

    simulations = [cavity_simulation, loft_simulation]

    # The first record of the base difference frame is the starting point for the scoring data
    recommendation_record = p.base_difference_record.df.to_dict("records")[0].copy()
    scoring_dict = p.create_recommendation_scoring_data(
        property_id=p.id,
        recommendation_record=recommendation_record,
        recommendations=simulations,
        primary_recommendation_id=cavity_simulation["recommendation_id"]
    )

    return [scoring_dict]
|
|
|
|
|
|
def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
    """
    Run the eligibility checks for every row of the merged HA32 data.

    For each row the EPC is searched by address and postcode. When no EPC is found (status 204) a row with
    the message "No EPC found" is recorded. Otherwise the Warmfront GBIS and ECO4 checks are run on the
    newest EPC, falling back to the penultimate EPC for identified properties that are not eligible on
    the newest one, followed by the full GBIS and ECO4 checks. Properties eligible under the Warmfront
    ECO4 check additionally get a model scoring row.

    :param ha_data: DataFrame; the columns read are "row_id", "identified", "Dwelling num", "Street",
        "Postcode", "Dwelling name" and "Dwelling type"
    :param cleaned: passed to Eligibility and prepare_model_data_row
    :param cleaning_data: passed to prepare_model_data_row
    :param created_at: passed to prepare_model_data_row
    :return: (results, scoring_data, no_house_numbers) - one result dict per searched row, the values
        returned by prepare_model_data_row, and the row ids skipped for lack of a house number
    """
    number_col = "Dwelling num"
    street_col = "Street"
    postcode_col = "Postcode"
    name_col = "Dwelling name"
    type_col = "Dwelling type"

    # Asset list dwelling type -> property type given to the EPC retrieval (None when there is no equivalent)
    epc_property_types = {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        'House': "House",
        'Store Room': None,
        'Bungalow Disabled': "Bungalow",
        'Flat Disabled': "Flat",
        'Dormer Bungalow': "Bungalow",
        'Pop-In': None,
        'Laundry': None,
        'Shed': None,
        'Bedsit': None,
    }

    # Fields that follow "row_id" and "warmfront_identified" in every result row, in output order
    epc_derived_fields = (
        "gbis_eligible",
        "eco4_eligible",
        "sap",
        "roof",
        "walls",
        "date_epc",
        "message",
        "gbis_eligible_future",
        "gbis_eligible_future_message",
        "eco4_eligible_future",
        "eco4_eligible_future_message",
        "tenure",
        "heating_description",
    )

    def warmfront_eligibility(epc):
        # Fresh Eligibility for the given EPC with both Warmfront checks already run
        checked = Eligibility(epc=epc, cleaned=cleaned)
        checked.check_gbis_warmfront()
        checked.check_eco4_warmfront()
        return checked

    scoring_data, results, no_house_numbers = [], [], []
    for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):

        # Without a house number we won't realistically be able to find an address, so the row is skipped
        if pd.isnull(house[number_col]):
            no_house_numbers.append(house["row_id"])
            continue

        # The dwelling name, when there is one, goes in front of the number and the street
        address_parts = [house[number_col], house[street_col]]
        if name_col is not None and not pd.isnull(house[name_col]):
            address_parts.insert(0, house[name_col])
        address1 = " ".join(address_parts)

        searcher = SearchEpc(address1=address1, postcode=house[postcode_col])

        response = searcher.search()
        if response["status"] == 204:
            # Nothing to assess: every EPC derived field stays empty
            no_epc_row = {"row_id": house["row_id"], "warmfront_identified": house["identified"]}
            no_epc_row.update(dict.fromkeys(epc_derived_fields))
            no_epc_row["message"] = "No EPC found"
            results.append(no_epc_row)
            continue

        newest_epc, older_epcs, _ = searcher.retrieve(
            property_type=epc_property_types.get(house[type_col], None)
        )
        # The penultimate EPC defaults to the newest one when there is no older EPC
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        penultimate_epc = penultimate_epc or newest_epc

        eligibility = warmfront_eligibility(newest_epc)

        # Only for properties identified by Warmfront: the newest EPC may be a post-retrofit EPC, in which
        # case the penultimate EPC is the pre-retrofit one, so it is assessed instead. For properties that
        # were not identified the newest EPC reflects the current state of the home and is kept
        not_eligible_on_newest = not (eligibility.eco4_warmfront["eligible"] or eligibility.gbis_warmfront)
        if not_eligible_on_newest and (house["identified"]):
            eligibility = warmfront_eligibility(penultimate_epc)

        # Full checks, reported in the "*_future" fields
        eligibility.check_gbis()
        eligibility.check_eco4()

        # ECO4 candidates also get a scoring row for the model
        message = None
        if eligibility.eco4_warmfront["eligible"]:
            scoring_dictionary = prepare_model_data_row(
                property_id=house["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at
            )
            scoring_data.append(scoring_dictionary)
            message = "eco4 conditional on post sap"

        results.append(
            {
                "row_id": house["row_id"],
                "warmfront_identified": house["identified"],
                "gbis_eligible": eligibility.gbis_warmfront,
                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                "sap": float(eligibility.epc["current-energy-efficiency"]),
                "roof": eligibility.roof["clean_description"],
                "walls": eligibility.walls["clean_description"],
                "date_epc": eligibility.epc["lodgement-date"],
                "message": message,
                "gbis_eligible_future": eligibility.gbis["eligible"],
                "gbis_eligible_future_message": eligibility.gbis["message"],
                "eco4_eligible_future": eligibility.eco4["eligible"],
                "eco4_eligible_future_message": eligibility.eco4["message"],
                "tenure": eligibility.tenure,
                "heating_description": eligibility.epc["mainheat-description"],
            }
        )

    return results, scoring_data, no_house_numbers
|
|
|
|
|
|
def _ha15_result_row(house, eligibility, message):
    """Build the result row of one HA15 property from its Eligibility object, after all checks have run."""
    return {
        "row_id": house["row_id"],
        "warmfront_identified": house["identified"],
        "gbis_eligible": eligibility.gbis_warmfront,
        "eco4_eligible": eligibility.eco4_warmfront["eligible"],
        "sap": float(eligibility.epc["current-energy-efficiency"]),
        "roof": eligibility.roof["clean_description"],
        "walls": eligibility.walls["clean_description"],
        "date_epc": eligibility.epc["lodgement-date"],
        "message": message,
        "gbis_eligible_future": eligibility.gbis["eligible"],
        "gbis_eligible_future_message": eligibility.gbis["message"],
        "eco4_eligible_future": eligibility.eco4["eligible"],
        "eco4_eligible_future_message": eligibility.eco4["message"],
        "tenure": eligibility.tenure,
        "heating_description": eligibility.epc["mainheat-description"],
    }


def _classify_post_install_sap(current_sap, post_install_sap):
    """
    Grade the predicted post-install SAP score against the thresholds that apply to the current SAP.

    A current SAP of 38 or below is graded against 57 / 55 / 53, anything else against 71 / 69 / 67,
    for "highest confidence" / "high confidence" / "medium confidence". Scores below the lowest
    threshold, and missing predictions, are graded "unlikely".
    """
    thresholds = (57, 55, 53) if current_sap <= 38 else (71, 69, 67)
    labels = ("highest confidence", "high confidence", "medium confidence")
    for threshold, label in zip(thresholds, labels):
        if post_install_sap >= threshold:
            return label
    return "unlikely"


def get_ha_15data(ha_data, cleaned, cleaning_data, created_at):
    """
    Run the eligibility checks for every row of the merged HA15 data and score the ECO4 candidates.

    Per row, the EPC is searched by address and postcode; rows without an EPC (status 204) get the
    message "No EPC found". Otherwise the Warmfront GBIS and ECO4 checks are run on the newest EPC,
    falling back to the penultimate EPC for identified properties that are not eligible on the newest
    one, followed by the full GBIS and ECO4 checks. The scoring rows of the ECO4 candidates are then
    cleaned, sent to the model, and the predicted post-install SAP is merged onto the results together
    with a confidence classification.

    :param ha_data: DataFrame; the columns read are "row_id", "identified", "Address Line 1",
        "Postcode" and "Property Type"
    :param cleaned: passed to Eligibility and prepare_model_data_row
    :param cleaning_data: passed to prepare_model_data_row and the DataProcessor cleaning steps
    :param created_at: passed to prepare_model_data_row and used as the ModelApi timestamp
    :return: (results_df, scoring_df, no_house_numbers). no_house_numbers is always an empty list for
        HA15, since the house number is part of "Address Line 1"; it is kept so that the return value
        has the same shape as for HA32

    NOTE(review): DataProcessor is not among the imports visible at the top of this module - confirm
    where it is imported from, otherwise the cleaning steps raise NameError.
    """
    address_key = "Address Line 1"
    postcode_key = "Postcode"
    house_type_key = "Property Type"

    # Asset list property type -> property type given to the EPC retrieval
    house_type_lookup = {
        "Bungalow": "Bungalow",
        "Flat": "Flat",
        'House': "House",
        'Flat over garage': "Flat",
        'Maisonette': "Maisonette",
    }

    scoring_data = []
    results = []
    no_house_numbers = []
    for _, house in tqdm(ha_data.iterrows(), total=len(ha_data)):

        # HA15 has neither a house number nor a house name column: the first address line is the address
        searcher = SearchEpc(
            address1=house[address_key],
            postcode=house[postcode_key]
        )

        response = searcher.search()
        if response["status"] == 204:
            # Same keys as every other result row, so the columns of the results don't depend on
            # which kind of row happens to come first
            results.append(
                {
                    "row_id": house["row_id"],
                    "warmfront_identified": house["identified"],
                    "gbis_eligible": None,
                    "eco4_eligible": None,
                    "sap": None,
                    "roof": None,
                    "walls": None,
                    "date_epc": None,
                    "message": "No EPC found",
                    "gbis_eligible_future": None,
                    "gbis_eligible_future_message": None,
                    "eco4_eligible_future": None,
                    "eco4_eligible_future_message": None,
                    "tenure": None,
                    "heating_description": None,
                }
            )
            continue

        newest_epc, older_epcs, _ = searcher.retrieve(
            property_type=house_type_lookup.get(house[house_type_key], None)
        )
        # We also want the penultimate EPC, defaulting to the newest one when there is no older EPC
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        if not penultimate_epc:
            penultimate_epc = newest_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()

        # Only for properties identified by Warmfront: the newest EPC may be a post-retrofit EPC, in which
        # case the penultimate EPC is the pre-retrofit one, so it is assessed instead. For properties that
        # were NOT identified the newest EPC reflects the current state of the home and is kept, as it
        # tells us whether there is a new opportunity for retrofit
        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront) and (house["identified"]):
            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()

        # Full GBIS and ECO4 checks, reported in the "*_future" fields
        eligibility.check_gbis()
        eligibility.check_eco4()

        message = None
        if eligibility.eco4_warmfront["eligible"]:
            scoring_dictionary = prepare_model_data_row(
                property_id=house["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at
            )
            # NOTE(review): prepare_model_data_row returns a one-element list, so scoring_data is a list
            # of lists - confirm this is the shape pd.DataFrame(scoring_data) below is meant to receive
            scoring_data.append(scoring_dictionary)
            message = "eco4 conditional on post sap"

        results.append(_ha15_result_row(house, eligibility, message))

    scoring_df = pd.DataFrame(scoring_data)

    # Same cleaning as in the recommendation engine - the number of rooms variables are cleaned first
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
    )

    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
    ).drop(columns=["LOCAL_AUTHORITY"])

    scoring_df = DataProcessor.clean_missings_after_description_process(
        scoring_df,
        ignore_cols=[
            c for c in scoring_df.columns
            if ("thermal_transmittance" in c) or ("insulation_thickness" in c) or ("ENERGY_EFF" in c)
        ]
    )

    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)

    # NOTE(review): the portfolio id says "ha32" although this is the HA15 run - confirm it is intended
    model_api = ModelApi(portfolio_id="ha32-eligibility", timestamp=created_at)
    all_predictions = model_api.predict_all(
        df=scoring_df,
        bucket="retrofit-data-dev",
        prediction_buckets={
            "sap_change_predictions": "retrofit-sap-predictions-dev",
            "heat_demand_predictions": "retrofit-heat-predictions-dev",
            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
        }
    )

    # Merge the predicted post-install SAP onto the results
    predictions = all_predictions["sap_change_predictions"]

    results_df = pd.DataFrame(results)

    results_df = results_df.merge(
        predictions[["predictions", "property_id"]].rename(
            columns={"predictions": "post_install_sap", "property_id": "row_id"}
        ),
        how="left",
        on="row_id"
    )

    # Properties are graded on the predicted post-install SAP. Several confidence levels are used to
    # accommodate the fact that the model can be wrong; the thresholds depend on the current SAP
    eco4_rows = results_df[results_df["eco4_eligible"] == True]
    eligibility_assessment = [
        {
            "row_id": row["row_id"],
            "eligibility_classification": _classify_post_install_sap(row["sap"], row["post_install_sap"]),
        }
        for _, row in eco4_rows.iterrows()
    ]

    # Explicit columns: without them an empty assessment has no "row_id" column and the merge fails
    eligibility_assessment = pd.DataFrame(
        eligibility_assessment, columns=["row_id", "eligibility_classification"]
    )

    results_df = results_df.merge(
        eligibility_assessment, how="left", on="row_id"
    )

    return results_df, scoring_df, no_house_numbers
|
|
|
|
|
|
def analyse_ha_32_results(results, ha32, no_house_numbers):
    """
    Compare our model-based eligibility results for HA32 against the properties identified by Warmfront.

    We want to know:
    1) What proportion of identified properties we get correct
    2) If we miss identified properties, why
    3) Which properties do we identify that were not identified by Warmfront. What is our confidence on these?

    For HA32, most of these (if not all) properties were identified under GBIS, so the success rate only
    looks at the gbis_eligible flag.

    :param results: Eligibility results, one record per property, in any form accepted by pd.DataFrame. The
        columns read here are tenure, warmfront_identified, gbis_eligible, eco4_eligible,
        gbis_eligible_future, eco4_eligible_future and date_epc
    :param ha32: The HA32 asset list. Only the row_id and identified columns are read
    :param no_house_numbers: Collection of row_id values for properties that had no house number
    :return: A tuple of (success_rate, new). success_rate is the share of Warmfront identified properties
        that we flag as GBIS eligible. new is a dictionary holding the number of new and future possibilities,
        along with the value counts of their high_confidence flag
    """

    results_df = pd.DataFrame(results)
    results_df["tenure"] = results_df["tenure"].fillna("Unknown - probably new build")

    # What proportion of the properties identified by Warmfront do we also flag?
    warmfront_identified = results_df[results_df["warmfront_identified"]]

    # For HA32, this is 89%
    success_rate = warmfront_identified["gbis_eligible"].sum() / warmfront_identified.shape[0]

    # Notes from manually investigating the Warmfront identified properties that we did not flag as
    # GBIS eligible:
    #
    # For these properties, warmfront identified all of them, however two did not seem to look valid.
    # We could perhaps update our detection, if the properties not found are not currently EPC C or above, but
    # do not look eligible from a building materials perspective (e.g. the properties in postcode HU4 6TG)
    #
    # Reason 1: The EPC indicates that the cavity is filled (GBIS allows for more than just cavity measures,
    #           however we check just the cavity for GBIS homes, since I believe this is what Warmfront have in
    #           place with regards to commercial agreements with the installer). An example of this is
    #           30 Coxwold Grove, HU4 6HH.
    #
    # Reason 2: Some properties do not have any existing data. This amounts for 16 of the 50 that we missed.
    #           We will be implementing a solution to interpolate homes that do not have any data, based on
    #           their neighbours. An example of this is 979 Hessle Road, HU4 6QG. If we look at the neighbours,
    #           we would likely infer that this property has an empty cavity and therefore would identify
    #
    # Reason 3: Some properties, e.g. 975 Hessle Road, HU4 6QG, look like they would qualify for GBIS,
    #           but is already a C, based on its Nov 2022 EPC (it was a C before that too). I'm personally not
    #           sure why this home would get identified as you would not be able to get GBIS funding. Same for
    #           977 Hessle road. This was the most common reason. Another example 8 Edith Cavell Court, HU5 4BA
    #
    # Reason 4: Some properties are a combination of reason 1 and 3. This could be to do with inaccurate EPCs
    #           as empirically speaking, when going through this manually, it seemed like the ones that fall
    #           into this category had slightly older EPCs (pre-2019). There are a few like this but e.g
    #           3, Summergroves Way HU4 6SZ

    # We now look for properties that we identified, that were not identified by Warmfront
    not_identified = ~results_df["warmfront_identified"]
    eligible_now = results_df["gbis_eligible"] | results_df["eco4_eligible"]

    new_possibilities = results_df[not_identified & eligible_now].copy()

    # Future possibilities are not eligible today, but are flagged as eligible in the future. A property
    # only counts towards the GBIS group if it is not already in the ECO4 group
    future_possibilities_eco = results_df[
        not_identified & (results_df["eco4_eligible_future"] == True) & ~eligible_now
    ].copy()

    future_possibilities_gbis = results_df[
        not_identified
        & (results_df["gbis_eligible_future"] == True)
        & (results_df["eco4_eligible_future"] == False)
        & ~eligible_now
    ].copy()

    # We deem that any EPC that is produced in the last 3 years gives us good confidence
    cutoff_date = datetime.now() - timedelta(days=3 * 365)

    # The flag has to be set on every frame that is summarised below, including the future GBIS group
    for candidates in (new_possibilities, future_possibilities_eco, future_possibilities_gbis):
        candidates["high_confidence"] = pd.to_datetime(candidates["date_epc"]) >= cutoff_date

    # We do a quick check on properties that didn't have a house number:
    no_house_numbers_ha32 = ha32[ha32["row_id"].isin(no_house_numbers)]["identified"].sum()
    if no_house_numbers_ha32:
        logger.error("We have some identified properties that have no house numbers - investigate me")

    new = {
        "n_new_possibilities": new_possibilities.shape[0],
        "new_possibilities_confidence": new_possibilities["high_confidence"].value_counts(),
        "future_possibilities_gbis": future_possibilities_gbis.shape[0],
        "future_possibilities_gbis_confidence": future_possibilities_gbis["high_confidence"].value_counts(),
        "future_possibilities_eco": future_possibilities_eco.shape[0],
        "future_possibilities_eco_confidence": future_possibilities_eco["high_confidence"].value_counts(),
    }

    return success_rate, new
|
|
|
|
|
|
def analyse_ha_15_results(results_df, ha15, no_house_numbers):
    """
    Compare our model-based eligibility results for HA15 against the properties identified by Warmfront.

    We want to know:
    1) What proportion of identified properties we get correct
    2) If we miss identified properties, why
    3) Which properties do we identify that were not identified by Warmfront. What is our confidence on these?

    A Warmfront identified property counts as found if we flag it as eligible for either GBIS or ECO4.

    NOTE: results_df is modified in place - missing tenure values are filled.

    :param results_df: DataFrame of eligibility results, one row per property. The columns read here are
        tenure, warmfront_identified, gbis_eligible, eco4_eligible, eligibility_classification, message, sap
        and date_epc
    :param ha15: The HA15 asset list. Currently unused, kept so the signature mirrors the HA32 analysis
    :param no_house_numbers: Currently unused, kept so the signature mirrors the HA32 analysis
    :return: A tuple of (success_rate, new, identified_results, missed_results). success_rate is the share
        of Warmfront identified properties that we flag for GBIS or ECO4, new describes the properties we flag
        that Warmfront did not identify, identified_results summarises the properties we both found, and
        missed_results holds counts for the Warmfront identified properties that we did not flag
    """

    results_df["tenure"] = results_df["tenure"].fillna("Unknown - probably new build")

    # What proportion of the properties identified by Warmfront do we also flag?
    warmfront_identified = results_df[results_df["warmfront_identified"]]

    n_identified = (warmfront_identified["gbis_eligible"] | warmfront_identified["eco4_eligible"]).sum()

    # For HA15 this is 50.3%
    success_rate = n_identified / warmfront_identified.shape[0]

    # Of the ECO4 properties we identify, what is the mix of confidence
    eco_identified_confidence = warmfront_identified[warmfront_identified["eco4_eligible"] == True][
        "eligibility_classification"].value_counts()

    # Properties Warmfront identified that we flag for neither scheme. Both conditions are evaluated on the
    # same frame so the mask lines up with the rows being filtered. Anything that is not exactly True
    # (including missing values) is treated as not eligible
    missed = warmfront_identified[
        (warmfront_identified["gbis_eligible"] != True) & (warmfront_identified["eco4_eligible"] != True)
    ]

    missed_no_data = missed[missed["message"] == "No EPC found"].shape[0]

    # Rows with a missing SAP score land in neither of these two groups
    sap_too_high = missed[missed["sap"] >= 69]
    sap_low_enough = missed[missed["sap"] < 69]

    # We now look for properties that we identified, that were not identified by Warmfront
    new_possibilities = results_df[
        (~results_df["warmfront_identified"]) &
        ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True))
    ].copy()

    # We deem that any EPC that is produced in the last 3 years gives us good confidence for GBIS
    cutoff_date = datetime.now() - timedelta(days=3 * 365)

    new_possibilities["high_confidence"] = pd.to_datetime(new_possibilities["date_epc"]) >= cutoff_date

    # ECO4 confidence comes from the eligibility classification, GBIS confidence from the age of the EPC
    eco_new_possibilities = new_possibilities["eco4_eligible"].sum()
    eco_new_possibilities_confidence = new_possibilities[
        new_possibilities["eco4_eligible"]
    ]["eligibility_classification"].value_counts()

    gbis_new_possibilites = new_possibilities["gbis_eligible"].sum()
    gbis_new_possibilites_confidence = new_possibilities[
        new_possibilities["gbis_eligible"]
    ]["high_confidence"].value_counts()

    new = {
        "new_possibilities": new_possibilities,
        "eco_new_possibilities": eco_new_possibilities,
        "eco_new_possibilities_confidence": eco_new_possibilities_confidence,
        "gbis_new_possibilites": gbis_new_possibilites,
        "gbis_new_possibilites_confidence": gbis_new_possibilites_confidence
    }

    identified_results = {
        "n_identified": n_identified,
        "success_rate": success_rate,
        "eco_identified_confidence": eco_identified_confidence
    }

    missed_results = {
        "n_missed": missed.shape[0],
        "n_sap_too_high": sap_too_high.shape[0],
        "n_sap_low_enough": sap_low_enough.shape[0],
        "missed_no_data": missed_no_data
    }

    return success_rate, new, identified_results, missed_results
|
|
|
|
|
|
def app():
|
|
ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses = load_data()
|
|
|
|
ha32, _ = marge_ha_32(asset_list=ha32_asset_list, identified_addresses=ha32_identified_addresses)
|
|
ha15, _ = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses)
|
|
|
|
ha32["row_id"] = ["h32" + str(i) for i in range(0, len(ha32))]
|
|
ha15["row_id"] = ["h15" + str(i) for i in range(0, len(ha15))]
|
|
|
|
cleaned = read_from_s3(
|
|
s3_file_name="cleaned_epc_data/cleaned.bson",
|
|
bucket_name="retrofit-data-dev"
|
|
)
|
|
cleaned = msgpack.unpackb(cleaned, raw=False)
|
|
|
|
cleaning_data = read_dataframe_from_s3_parquet(
|
|
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
|
|
)
|
|
|
|
created_at = datetime.now().isoformat()
|
|
|
|
# We want to retrieve EPCs for every single property
|
|
# NOTE: HA32 is MOSTLY cavity via GBIS
|
|
|
|
ha32_results, ha32_scoring_data, ha32_no_house_numbers = get_ha_32data(
|
|
ha_data=ha32,
|
|
cleaned=cleaned,
|
|
cleaning_data=cleaning_data,
|
|
created_at=created_at
|
|
)
|
|
|
|
# with open("ha32.pickle", "wb") as f:
|
|
# pickle.dump(
|
|
# {
|
|
# "ha32_results": ha32_results,
|
|
# "ha32_scoring_data": ha32_scoring_data,
|
|
# "ha32_no_house_numbers": ha32_no_house_numbers
|
|
# },
|
|
# f
|
|
# )
|
|
|
|
# with open("ha32.pickle", "rb") as f:
|
|
# ha32_dict = pickle.load(f)
|
|
#
|
|
# ha32_results = ha32_dict["ha32_results"]
|
|
# ha32_scoring_data = ha32_dict["ha32_scoring_data"]
|
|
# ha32_no_house_numbers = ha32_dict["ha32_no_house_numbers"]
|
|
|
|
ha32_success_rate, ha32_new_possibilities = analyse_ha_32_results(
|
|
results=ha32_results, ha32=ha32, no_house_numbers=ha32_no_house_numbers
|
|
)
|
|
|
|
# HA 15
|
|
ha15_results_df, ha15_scoring_df, ha15_no_house_numbers = get_ha_15data(ha15, cleaned, cleaning_data, created_at)
|
|
|
|
# with open("ha15.pickle", "wb") as f:
|
|
# pickle.dump(
|
|
# {
|
|
# "ha15_results_df": ha15_results_df,
|
|
# "ha15_scoring_df": ha15_scoring_df,
|
|
# "ha15_no_house_numbers": ha15_no_house_numbers
|
|
# },
|
|
# f
|
|
# )
|
|
|
|
# with open("ha15.pickle", "rb") as f:
|
|
# ha15_dict = pickle.load(f)
|
|
#
|
|
# ha15_results_df = ha15_dict["ha15_results_df"]
|
|
# ha15_scoring_df = ha15_dict["ha15_scoring_df"]
|
|
# ha15_no_house_numbers = ha15_dict["ha15_no_house_numbers"]
|
|
|
|
ha15_success_rate, ha15_new, ha15_identified_results, ha15_missed_results = analyse_ha_15_results(
|
|
results_df=ha15_results_df,
|
|
ha15=ha15,
|
|
no_house_numbers=ha15_no_house_numbers
|
|
)
|