Model/etl/testing_data/estimate_epc.py
2024-01-03 10:46:22 +00:00

156 lines
6.2 KiB
Python

from pathlib import Path
from random import choices, sample
import os
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc, vartypes
from BaseUtility import Definitions
from etl.epc.settings import BUILT_FORM_REMAP
# Path of the .env file, resolved relative to this script's own directory.
ENV_FILE = Path(__file__).parent / "backend" / ".env"
logger = setup_logger()
# Root folder that is scanned for per-directory "certificates.csv" files.
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
# Number of unique UPRNs sampled from each directory.
DIR_SAMPLE_SIZE = 500
# Number of directories drawn from DATA_DIRECTORY.
N_DIRECTORIES = 50
# Load the .env file BEFORE reading variables from the environment; otherwise a
# token that is only defined in the .env file would be read as None.
load_dotenv(ENV_FILE)
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
# String-typed EPC fields that are excluded from the categorical comparison.
CATETORICALS_TO_IGNORE = [
    "postcode", "constituency", "local-authority", "built-form", "property-type", "address1", "constituency-label",
    "building-reference-number", "address2", "posttown", "transaction-type", "lmk-key", "address3",
    "local-authority-label", "county",
]
def check_numeric_performance(estimated_value, actual_value):
    """Return the relative error of an estimate against the actual value.

    Args:
        estimated_value: The estimated number (may be null/NaN).
        actual_value: The reference number (may be null/NaN).

    Returns:
        ``None`` when there is no actual value to compare against, ``1`` when
        the estimate is missing, ``0``/``1`` when the actual value is zero and
        the estimate is zero/non-zero, otherwise
        ``abs(estimated - actual) / abs(actual)``.
    """
    # If we don't have anything to compare against, return None
    if pd.isnull(actual_value):
        return None
    # A missing estimate is scored as a 100% error
    if pd.isnull(estimated_value):
        return 1
    # A relative error is undefined for a zero reference: exact match or full miss
    if actual_value == 0:
        return 0 if estimated_value == 0 else 1
    # Divide by the magnitude so that a negative actual value cannot produce a
    # negative "error" (which would inflate the success score downstream).
    return abs(estimated_value - actual_value) / abs(actual_value)
def _mean_or_nan(values):
    """Return the arithmetic mean of ``values``, or NaN when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else float("nan")


def _sample_latest_epcs(directory):
    """Load one directory's certificates and sample the newest EPC per UPRN.

    Returns:
        A ``(df_sample, lmks_by_uprn)`` tuple. ``df_sample`` holds one row per
        sampled UPRN (the newest by LODGEMENT_DATETIME) with lower-cased,
        hyphenated column names. ``lmks_by_uprn`` maps each sampled UPRN to
        the list of every LMK_KEY recorded for it in the file.
    """
    df = pd.read_csv(directory / "certificates.csv", low_memory=False)
    # Drop rows without a UPRN BEFORE the string conversion: once converted,
    # a missing value is just another string and is no longer detected as null.
    df = df.dropna(subset=["UPRN"]).copy()
    df["UPRN"] = df["UPRN"].astype("Int64").astype("str")

    # Never ask for more UPRNs than the file has; random.sample raises otherwise
    unique_uprns = df["UPRN"].unique().tolist()
    uprn_sample = sample(unique_uprns, min(DIR_SAMPLE_SIZE, len(unique_uprns)))
    df_sample = df[df["UPRN"].isin(uprn_sample)]

    # Collect every LMK key per sampled UPRN in one pass instead of scanning
    # the whole frame once per property.
    lmks_by_uprn = df_sample.groupby("UPRN")["LMK_KEY"].agg(list).to_dict()

    # Take the record with the newest LODGEMENT_DATETIME by uprn
    df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
    # Convert the columns to lower case and replace underscores with hyphens, the same as the api
    df_sample.columns = df_sample.columns.str.lower().str.replace("_", "-")
    return df_sample, lmks_by_uprn


def _score_epc(epc, lmks_to_drop, numeric_keys, categorical_keys):
    """Estimate one EPC and score the estimate against the real record.

    Args:
        epc: The real EPC record as a dict keyed by api-style column names.
        lmks_to_drop: LMK keys of this property's own certificates, passed to
            ``estimate_epc`` so they are excluded from the estimate.
        numeric_keys: Fields scored with ``check_numeric_performance``.
        categorical_keys: Fields scored by exact equality.

    Returns:
        A dict with the UPRN, both success scores and the grouping fields.
    """
    searcher = SearchEpc(epc["address1"], epc["postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key="")
    searcher.uprn = epc["uprn"]

    # Perform the same remapping for built-form as in the Property class for this test, in case we get (e.g.)
    # Enclosed End-Terrace
    built_form = BUILT_FORM_REMAP.get(epc["built-form"], epc["built-form"])
    if (epc["property-type"] == "Maisonette" and built_form == "Detached") or (
        built_form in Definitions.DATA_ANOMALY_MATCHES
    ):
        built_form = ""

    estimated_epc = searcher.estimate_epc(
        property_type=epc["property-type"], built_form=built_form, lmks_to_drop=lmks_to_drop
    )

    # We now compare the difference between the estimated and original
    # TODO: We can convert windows and lighting to numeric versions and estimate how close we are
    numeric_errors = [check_numeric_performance(estimated_epc[key], epc[key]) for key in numeric_keys]
    # Fields without an actual value come back as None and are not scored
    numeric_errors = [error for error in numeric_errors if error is not None]
    # NaN (rather than a ZeroDivisionError) when no numeric field was comparable
    numeric_success = 1 - _mean_or_nan(numeric_errors)

    # Categorical performance: 1 for an exact match, 0 otherwise.
    # NOTE(review): a field missing on both sides counts as a miss here, unlike
    # the numeric fields where a missing actual value is skipped - confirm intended.
    categorical_hits = [0 if estimated_epc[key] != epc[key] else 1 for key in categorical_keys]
    categorical_success = _mean_or_nan(categorical_hits)

    return {
        "uprn": epc["uprn"],
        "numeric_success": numeric_success,
        "categorical_success": categorical_success,
        "property_type": epc["property-type"],
        "built_form": epc["built-form"],
        "tenure": epc["tenure"],
    }


def app():
    """
    This script is used to test the EPC estimation process.

    It draws N_DIRECTORIES directories from DATA_DIRECTORY, samples up to
    DIR_SAMPLE_SIZE properties from each, estimates an EPC for every sampled
    property and compares the estimate with the newest real certificate.

    Returns:
        A dict with the overall median scores ("avg_numeric_success",
        "avg_categorical_success") and the grouped summary frames
        ("by_tenure", "by_property_type", "by_property_type_built_form").
        Previously these aggregates were computed and discarded; callers that
        ignore the return value are unaffected.

    Raises:
        ValueError: If a vartype is neither numeric nor string, if no
            directories are found, or if no EPC record could be evaluated.
    """
    numeric_keys = [key for key, value in vartypes.items() if value in ("float", "Int64")]
    categorical_keys = [key for key, value in vartypes.items() if value == "str"]

    # Make sure we haven't missed any keys
    if len(numeric_keys) + len(categorical_keys) != len(vartypes):
        raise ValueError("Not all vartypes have been accounted for")

    # Drop some keys that aren't important
    ignored = set(CATETORICALS_TO_IGNORE)
    categorical_keys = [key for key in categorical_keys if key not in ignored]

    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
    if not directories:
        raise ValueError(f"No certificate directories found in {DATA_DIRECTORY}")
    # NOTE(review): choices() samples with replacement, so the same directory
    # can be evaluated more than once - confirm this is intended.
    directory_sample = choices(directories, k=N_DIRECTORIES)

    results = []
    for directory in tqdm(directory_sample):
        df_sample, lmks_by_uprn = _sample_latest_epcs(directory)
        # For each epc, we test the estimation process
        for _, row in df_sample.iterrows():
            epc = row.to_dict()
            # All EPCs for this uprn are dropped from the estimate_epc function
            results.append(_score_epc(epc, lmks_by_uprn[epc["uprn"]], numeric_keys, categorical_keys))

    if not results:
        raise ValueError("No EPC records were evaluated")

    # Get aggregate performance figures
    results_df = pd.DataFrame(results)
    results_df["tenure"] = results_df["tenure"].replace("Rented (social)", "rental (social)")

    # Median scores plus a record count, reused for every grouping below
    summary_spec = {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}

    return {
        "avg_numeric_success": results_df["numeric_success"].median(),
        "avg_categorical_success": results_df["categorical_success"].median(),
        # Group by tenure
        "by_tenure": results_df.groupby("tenure").agg(summary_spec),
        # By property type - we also want to see how many properties we have for each property type
        "by_property_type": results_df.groupby("property_type").agg(summary_spec),
        # By property_type & built form
        "by_property_type_built_form": results_df.groupby(["property_type", "built_form"]).agg(summary_spec),
    }