Model/etl/testing_data/estimate_epc.py

from pathlib import Path
from random import choices, sample

import os
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc, vartypes
from BaseUtility import Definitions
from etl.epc.settings import BUILT_FORM_REMAP

# Location of the .env file that holds the credentials used by this script.
ENV_FILE = Path(__file__).parent / "backend" / ".env"

logger = setup_logger()

# Root folder containing one sub-directory per area, each with a certificates.csv file.
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
# Number of unique UPRNs sampled from each directory.
DIR_SAMPLE_SIZE = 500
# Number of directories sampled from DATA_DIRECTORY.
N_DIRECTORIES = 50

# The .env file must be loaded BEFORE the token is read from the environment; otherwise a token that is
# only defined in the .env file is never seen and EPC_AUTH_TOKEN ends up as None.
load_dotenv(ENV_FILE)

EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

# String-typed fields that are excluded from the categorical comparison.
# NOTE(review): the constant name is misspelt ("CATETORICALS"); it is kept as-is because other code refers to it.
CATETORICALS_TO_IGNORE = [
    "postcode", "constituency", "local-authority", "built-form", "property-type", "address1", "constituency-label",
    "building-reference-number", "address2", "posttown", "transaction-type", "lmk-key", "address3",
    "local-authority-label", "county",
]


def check_numeric_performance(estimated_value, actual_value):
    """
    Return the relative error of an estimated numeric value against the actual value.

    :param estimated_value: the value produced by the estimation process (may be null)
    :param actual_value: the value recorded on the real certificate (may be null)
    :return: None when there is no actual value to compare against; 1 when the estimate is missing, or
        when the actual value is 0 and the estimate is not; 0 when both are 0; otherwise
        abs(estimated - actual) / abs(actual), which is always non-negative.
    """
    # If we don't have anything to compare against, return None
    if pd.isnull(actual_value):
        return None

    # A missing estimate is treated as a complete miss
    if pd.isnull(estimated_value):
        return 1

    # A relative error is undefined for an actual value of 0, so score it as exact match (0) or miss (1)
    if actual_value == 0:
        return 0 if estimated_value == 0 else 1

    # Divide by the magnitude of the actual value: dividing by a negative actual value would yield a
    # negative "error", which would wrongly push the derived success score above 1
    return abs(estimated_value - actual_value) / abs(actual_value)


def _mean_or_nan(values):
    """Return the arithmetic mean of *values*, or NaN when there is nothing to average."""
    values = list(values)
    if not values:
        return float("nan")
    return sum(values) / len(values)


def app():
    """
    This script is used to test the EPC estimation process.

    A random set of directories under DATA_DIRECTORY is selected, and from each directory's certificates.csv a
    random set of UPRNs is selected. For every sampled UPRN the newest certificate is compared against the
    output of SearchEpc.estimate_epc, which is told to drop every certificate belonging to that UPRN. A numeric
    and a categorical success score is recorded per property, and the medians are then aggregated overall,
    by tenure, by property type and by property type & built form.

    :raises ValueError: if vartypes contains a type other than "float", "Int64" or "str"
    """

    numerical_vartypes = {key: value for key, value in vartypes.items() if value in ["float", "Int64"]}
    str_var_types = {key: value for key, value in vartypes.items() if value == "str"}
    # Make sure we haven't missed any keys
    if len(numerical_vartypes) + len(str_var_types) != len(vartypes):
        raise ValueError("Not all vartypes have been accounted for")

    # Drop some keys that aren't important
    for k in CATETORICALS_TO_IGNORE:
        str_var_types.pop(k, None)

    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    # Sample WITHOUT replacement so that no directory is evaluated twice, and cap the sample size so that
    # having fewer than N_DIRECTORIES directories available is not an error
    directory_sample = sample(directories, min(N_DIRECTORIES, len(directories)))

    results = []

    for directory in tqdm(directory_sample):
        filepath = directory / "certificates.csv"
        df = pd.read_csv(filepath, low_memory=False)
        # Drop the records without a UPRN BEFORE casting to str: once cast, a missing value becomes the literal
        # string "<NA>" and can no longer be detected as null
        df = df.dropna(subset=["UPRN"])
        df = df.assign(UPRN=df["UPRN"].astype("Int64").astype("str"))

        # Cap the sample size, since random.sample raises ValueError when asked for more items than exist
        unique_uprns = df["UPRN"].unique().tolist()
        uprn_sample = sample(unique_uprns, min(DIR_SAMPLE_SIZE, len(unique_uprns)))
        df_sample = df[df["UPRN"].isin(uprn_sample)]

        # Collect every LMK key per sampled uprn once, rather than scanning the whole frame for each property
        lmks_by_uprn = df_sample.groupby("UPRN")["LMK_KEY"].agg(list).to_dict()

        # Take the record with the newest LODGEMENT_DATETIME by uprn
        df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
        # Convert the columns to lower case and replace underscores with hyphens, the same as the api
        df_sample.columns = df_sample.columns.str.lower().str.replace("_", "-")

        # For each epc, we test the estimation process
        for _, epc in df_sample.iterrows():
            epc = epc.to_dict()
            uprn = epc["uprn"]

            # All EPCs for this uprn get dropped inside the estimate_epc function, so that the estimate cannot
            # simply be read back from the property's own certificates
            lmks_to_drop = lmks_by_uprn[uprn]
            searcher = SearchEpc(epc["address1"], epc["postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key="")
            searcher.uprn = uprn

            # Perform the same remapping for built-form as in the Property class for this test, in case we get (e.g.)
            # Enclosed End-Terrace
            built_form = BUILT_FORM_REMAP.get(epc["built-form"], epc["built-form"])
            is_detached_maisonette = epc["property-type"] == "Maisonette" and built_form == "Detached"
            if is_detached_maisonette or built_form in Definitions.DATA_ANOMALY_MATCHES:
                built_form = ""

            estimated_epc = searcher.estimate_epc(
                property_type=epc["property-type"], built_form=built_form, lmks_to_drop=lmks_to_drop
            )

            # We now compare the difference between the estimated and original
            # TODO: We can convert windows and lighting to numeric versions and estimate how close we are
            numeric_errors = (
                check_numeric_performance(estimated_epc[key], epc[key]) for key in numerical_vartypes
            )
            # Fields with no actual value (None) are skipped; if nothing is comparable the score is NaN, which
            # the median aggregations below ignore, instead of failing with a ZeroDivisionError
            numeric_success = 1 - _mean_or_nan(error for error in numeric_errors if error is not None)

            # categorical performance: the share of fields where the estimate equals the actual value
            categorical_success = _mean_or_nan(
                0 if estimated_epc[key] != epc[key] else 1 for key in str_var_types
            )

            results.append(
                {
                    "uprn": uprn,
                    "numeric_success": numeric_success,
                    "categorical_success": categorical_success,
                    "property_type": epc["property-type"],
                    "built_form": epc["built-form"],
                    "tenure": epc["tenure"],
                }
            )

    # Get aggregate performance figures
    results_df = pd.DataFrame(results)
    results_df["tenure"] = results_df["tenure"].replace("Rented (social)", "rental (social)")

    # NOTE(review): none of the aggregates below is returned or logged within this part of the function;
    # confirm where they are meant to be consumed
    avg_numeric_succes = results_df["numeric_success"].median()
    avg_categorical_sucess = results_df["categorical_success"].median()

    # Median scores per group, plus the number of properties that fall into each group
    aggregations = {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}

    # Group by tenure
    by_tenure = results_df.groupby("tenure").agg(aggregations)
    # By property type - we also want to see how many properties we have for each property type
    by_property_type = results_df.groupby("property_type").agg(aggregations)
    # By property_type & built form
    by_property_type_built_form = results_df.groupby(["property_type", "built_form"]).agg(aggregations)