# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 156 lines, 6.2 KiB, Python
from pathlib import Path
|
|
from random import choices, sample
|
|
|
|
import os
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
from dotenv import load_dotenv
|
|
from utils.logger import setup_logger
|
|
from backend.SearchEpc import SearchEpc, vartypes
|
|
from BaseUtility import Definitions
|
|
from etl.epc.settings import BUILT_FORM_REMAP
|
|
|
|
# Environment file that holds the secrets (e.g. EPC_AUTH_TOKEN) used by this script
ENV_FILE = Path(__file__).parent / "backend" / ".env"

logger = setup_logger()

# Folder whose sub-directories each contain a certificates.csv file
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
# Number of UPRNs sampled from each directory
DIR_SAMPLE_SIZE = 500
# Number of directories sampled per run
N_DIRECTORIES = 50

# Load the .env file BEFORE reading from the environment, otherwise a token that is only defined in the
# .env file is never picked up and EPC_AUTH_TOKEN ends up as None
load_dotenv(ENV_FILE)

EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

# String-typed EPC fields that are excluded from the categorical comparison.
# NOTE: the name is misspelt ("CATETORICALS") but is kept as-is because app() references it.
CATETORICALS_TO_IGNORE = [
    "postcode", "constituency", "local-authority", "built-form", "property-type", "address1", "constituency-label",
    "building-reference-number", "address2", "posttown", "transaction-type", "lmk-key", "address3",
    "local-authority-label", "county",
]
|
|
|
|
|
|
def check_numeric_performance(estimated_value, actual_value):
    """
    Score a single numeric estimate as a relative error against the actual value.

    :param estimated_value: Value produced by the estimation process (may be null).
    :param actual_value: Value recorded on the real certificate (may be null).
    :return: None when there is no actual value to compare against; 1 when the estimate is missing, or
        when the actual value is zero but the estimate is not; 0 when both are zero; otherwise
        abs(estimated - actual) / abs(actual).
    """
    # If we don't have anything to compare against, return None
    if pd.isnull(actual_value):
        return None

    # A missing estimate counts as a complete miss
    if pd.isnull(estimated_value):
        return 1

    if actual_value == 0:
        # Relative error is undefined against zero: an exact match scores 0, anything else is a full miss
        return 0 if estimated_value == 0 else 1

    # Divide by the magnitude of the actual value so that a negative actual value cannot yield a negative
    # error (which would push the derived success score above 1)
    return abs(estimated_value - actual_value) / abs(actual_value)
|
|
|
|
|
|
def app():
    """
    This script is used to test the EPC estimation process.

    A random sample of directories is taken from DATA_DIRECTORY and, within each, a random sample of UPRNs.
    For the newest certificate of every sampled UPRN an estimate is requested from SearchEpc (passing the
    LMK keys of all of that UPRN's certificates as lmks_to_drop) and compared field by field against the
    real certificate. Median success figures are then logged overall and grouped by tenure, property type
    and property type & built form.

    :raises ValueError: If vartypes contains a type other than "float", "Int64" or "str".
    """
    numerical_vartypes = {key: value for key, value in vartypes.items() if value in ["float", "Int64"]}
    str_var_types = {key: value for key, value in vartypes.items() if value == "str"}
    # Make sure we haven't missed any keys
    if len(numerical_vartypes) + len(str_var_types) != len(vartypes):
        raise ValueError("Not all vartypes have been accounted for")

    # Drop some keys that aren't important
    for k in CATETORICALS_TO_IGNORE:
        str_var_types.pop(k, None)

    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    # Sample WITHOUT replacement so the same directory is never evaluated twice, and cap the sample size
    # so that having fewer than N_DIRECTORIES directories available is not an error
    directory_sample = sample(directories, min(N_DIRECTORIES, len(directories)))

    results = []

    for directory in tqdm(directory_sample):
        filepath = directory / "certificates.csv"
        df = pd.read_csv(filepath, low_memory=False)
        # Drop missing UPRNs BEFORE the string cast: once cast, a missing value becomes the literal string
        # "<NA>", which a null check no longer catches
        df = df[df["UPRN"].notna()].copy()
        df["UPRN"] = df["UPRN"].astype("Int64").astype("str")

        unique_uprns = df["UPRN"].unique().tolist()
        if not unique_uprns:
            continue

        # random.sample raises if asked for more items than exist, so cap at what is available
        uprn_sample = sample(unique_uprns, min(DIR_SAMPLE_SIZE, len(unique_uprns)))
        df_sample = df[df["UPRN"].isin(uprn_sample)]
        # Take the record with the newest LODGEMENT_DATETIME by uprn
        df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
        # Convert the columns to lower case and replace underscores with hyphens, the same as the api
        df_sample.columns = df_sample.columns.str.lower().str.replace("_", "-")

        # For each epc, we test the estimation process
        for _, epc in df_sample.iterrows():
            epc = epc.to_dict()
            address1 = epc["address1"]
            postcode = epc["postcode"]

            # Get all EPCs for this uprn and make sure they get dropped from the estimate_epc function
            # (df still carries the original upper-case column names, only df_sample was renamed)
            lmks_to_drop = df.loc[df["UPRN"] == epc["uprn"], "LMK_KEY"].tolist()
            searcher = SearchEpc(address1, postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="")
            searcher.uprn = epc["uprn"]

            # Perform the same remapping for built-form as in the Property class for this test, in case we
            # get (e.g.) Enclosed End-Terrace
            built_form = BUILT_FORM_REMAP.get(epc["built-form"], epc["built-form"])
            is_detached_maisonette = epc["property-type"] == "Maisonette" and built_form == "Detached"
            if is_detached_maisonette or built_form in Definitions.DATA_ANOMALY_MATCHES:
                built_form = ""

            estimated_epc = searcher.estimate_epc(
                property_type=epc["property-type"], built_form=built_form, lmks_to_drop=lmks_to_drop
            )

            # We now compare the difference between the estimated and original
            # TODO: We can convert windows and lighting to numeric versions and estimate how close we are
            numeric_errors = [
                check_numeric_performance(estimated_epc[key], epc[key]) for key in numerical_vartypes
            ]
            # Remove Nones (fields with no actual value to compare against)
            numeric_errors = [error for error in numeric_errors if error is not None]
            # Get an average; NaN when nothing was comparable so that the medians below simply skip it
            if numeric_errors:
                numeric_success = 1 - sum(numeric_errors) / len(numeric_errors)
            else:
                numeric_success = float("nan")

            # categorical performance: 1 for an exact match, 0 otherwise
            categorical_matches = [0 if estimated_epc[key] != epc[key] else 1 for key in str_var_types]
            # Get an average
            if categorical_matches:
                categorical_success = sum(categorical_matches) / len(categorical_matches)
            else:
                categorical_success = float("nan")

            results.append(
                {
                    "uprn": epc["uprn"],
                    "numeric_success": numeric_success,
                    "categorical_success": categorical_success,
                    "property_type": epc["property-type"],
                    "built_form": epc["built-form"],
                    "tenure": epc["tenure"],
                }
            )

    # NOTE(review): logger comes from the project's setup_logger(); it is assumed to expose the standard
    # logging.Logger API (warning/info) - confirm
    if not results:
        logger.warning("No EPCs were evaluated - check the contents of %s", DATA_DIRECTORY)
        return

    # Get aggregate performance figures
    results_df = pd.DataFrame(results)
    results_df["tenure"] = results_df["tenure"].replace("Rented (social)", "rental (social)")

    avg_numeric_success = results_df["numeric_success"].median()
    avg_categorical_success = results_df["categorical_success"].median()

    # Median success per group, plus how many properties fall in each group
    summary = {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}

    # Group by tenure
    by_tenure = results_df.groupby("tenure").agg(summary)
    # By property type - we also want to see how many properties we have for each property type
    by_property_type = results_df.groupby("property_type").agg(summary)
    # By property_type & built form
    by_property_type_built_form = results_df.groupby(["property_type", "built_form"]).agg(summary)

    # Report the figures rather than silently discarding them
    logger.info("Median numeric success: %s", avg_numeric_success)
    logger.info("Median categorical success: %s", avg_categorical_success)
    logger.info("Performance by tenure:\n%s", by_tenure)
    logger.info("Performance by property type:\n%s", by_property_type)
    logger.info("Performance by property type and built form:\n%s", by_property_type_built_form)
|