"""Benchmark script: compare estimated EPC values against locally stored EPC certificates."""

from pathlib import Path
from random import choices, sample
import os

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc, vartypes
from BaseUtility import Definitions
from etl.epc.settings import BUILT_FORM_REMAP

ENV_FILE = Path(__file__).parent / "backend" / ".env"
logger = setup_logger()

# The .env file must be loaded *before* any environment variable is read, otherwise values that are only
# defined in the file (such as the EPC auth token) resolve to None.
load_dotenv(ENV_FILE)

DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
DIR_SAMPLE_SIZE = 500
N_DIRECTORIES = 50
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

# String-typed EPC fields that are excluded from the categorical comparison.
# NOTE: the (misspelled) constant name is kept as-is because other modules may import it.
CATETORICALS_TO_IGNORE = [
    "postcode",
    "constituency",
    "local-authority",
    "built-form",
    "property-type",
    "address1",
    "constituency-label",
    "building-reference-number",
    "address2",
    "posttown",
    "transaction-type",
    "lmk-key",
    "address3",
    "local-authority-label",
    "county",
]

# Tenure labels that appear in two spellings in the certificate data; they are folded onto a single label so
# that each tenure is reported as one group.
_TENURE_LABEL_REMAP = {
    "Rented (social)": "rental (social)",
    "Rented (private)": "rental (private)",
    "Owner-occupied": "owner-occupied",
}


def check_numeric_performance(estimated_value, actual_value):
    """
    Return the relative error of an estimated numeric value against the actual value.

    :param estimated_value: the value produced by the estimation process (may be null)
    :param actual_value: the value recorded on the real certificate (may be null)
    :return: None when there is no actual value to compare against; 1 when the estimate is missing, or
        when the actual value is zero and the estimate is not; 0 when both are zero; otherwise
        ``abs(estimated - actual) / abs(actual)``. The result is not capped, so it can exceed 1.
    """
    # If we don't have anything to compare against, return None
    if pd.isnull(actual_value):
        return None

    # A missing estimate is scored as a complete miss
    if pd.isnull(estimated_value):
        return 1

    # A relative error is undefined against zero, so score it as all-or-nothing
    if actual_value == 0:
        return 0 if estimated_value == 0 else 1

    # Divide by the magnitude of the actual value so that a negative actual value cannot flip the sign
    # of the error
    return abs(estimated_value - actual_value) / abs(actual_value)


def _mean_or_nan(values):
    """Return the arithmetic mean of ``values``, or NaN when there is nothing to average."""
    values = list(values)
    if not values:
        return float("nan")
    return sum(values) / len(values)


def _normalise_built_form(property_type, built_form):
    """Remap a built form via BUILT_FORM_REMAP and blank it out for known anomalous combinations."""
    # Perform the same remapping for built-form as in the Property class for this test, in case we get (e.g.)
    # Enclosed End-Terrace
    built_form = BUILT_FORM_REMAP.get(built_form, built_form)

    is_detached_maisonette = property_type == "Maisonette" and built_form == "Detached"
    if is_detached_maisonette or built_form in Definitions.DATA_ANOMALY_MATCHES:
        return ""

    return built_form


def _score_estimate(estimated_epc, epc, numeric_keys, categorical_keys):
    """
    Score one estimated EPC against the real one.

    :return: tuple ``(numeric_success, categorical_success)``; either element is NaN when there were no
        comparable fields of that kind.
    """
    # TODO: We can convert windows and lighting to numeric versions and estimate how close we are
    numeric_errors = (check_numeric_performance(estimated_epc[key], epc[key]) for key in numeric_keys)
    # None means "no actual value to compare against", so those fields are left out of the average
    mean_error = _mean_or_nan(error for error in numeric_errors if error is not None)
    numeric_success = 1 - mean_error

    categorical_success = _mean_or_nan(0 if estimated_epc[key] != epc[key] else 1 for key in categorical_keys)

    return numeric_success, categorical_success


def _evaluate_directory(directory, numeric_keys, categorical_keys):
    """
    Run the estimation test for the fixed UPRN sample of one certificates directory.

    :param directory: directory expected to contain a ``certificates.csv`` file
    :param numeric_keys: EPC fields compared with ``check_numeric_performance``
    :param categorical_keys: EPC fields compared for exact equality
    :return: list with one result dict per sampled UPRN (empty when the csv file is missing)
    """
    filepath = directory / "certificates.csv"
    if not filepath.is_file():
        logger.warning("Skipping %s: no certificates.csv found", directory)
        return []

    df = pd.read_csv(filepath, low_memory=False)

    # Drop rows without a UPRN *before* the string conversion: once cast to str a missing value becomes the
    # literal "<NA>" and can no longer be detected as null.
    df = df[df["UPRN"].notna()].copy()
    df["UPRN"] = df["UPRN"].astype("Int64").astype("str")

    # uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
    # Take a fixed sample based on the first DIR_SAMPLE_SIZE uprns (sorted as strings)
    uprn_sample = sorted(df["UPRN"].unique().tolist())[:DIR_SAMPLE_SIZE]
    df_for_sample = df[df["UPRN"].isin(uprn_sample)]

    # Collect all LMK keys per uprn once, so that every EPC for the property under test can be dropped from
    # the estimate_epc function without re-scanning the whole frame for each row
    lmks_by_uprn = {uprn: lmk_keys.tolist() for uprn, lmk_keys in df_for_sample.groupby("UPRN")["LMK_KEY"]}

    # Take the record with the newest LODGEMENT_DATETIME by uprn
    df_sample = df_for_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")

    # Convert the columns to lower case and replace underscores with hyphens, the same as the api
    df_sample.columns = df_sample.columns.str.lower().str.replace("_", "-")

    results = []
    # For each epc, we test the estimation process
    for _, row in df_sample.iterrows():
        epc = row.to_dict()

        searcher = SearchEpc(epc["address1"], epc["postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key="")
        searcher.uprn = epc["uprn"]

        estimated_epc = searcher.estimate_epc(
            property_type=epc["property-type"],
            built_form=_normalise_built_form(epc["property-type"], epc["built-form"]),
            lmks_to_drop=lmks_by_uprn[epc["uprn"]],
        )

        # We now compare the difference between the estimated and original
        numeric_success, categorical_success = _score_estimate(estimated_epc, epc, numeric_keys, categorical_keys)

        results.append(
            {
                "uprn": epc["uprn"],
                "numeric_success": numeric_success,
                "categorical_success": categorical_success,
                "property_type": epc["property-type"],
                "built_form": epc["built-form"],
                "tenure": epc["tenure"],
            }
        )

    return results


def app():
    """
    This script is used to test the EPC estimation process.

    A sample of certificate directories is drawn from DATA_DIRECTORY. For a fixed set of UPRNs in each
    directory the newest certificate is estimated via ``SearchEpc.estimate_epc`` (with the property's own
    certificates excluded) and compared with the real record. Median success figures are logged overall and
    by tenure, property type and property type / built form.

    :raises ValueError: if ``vartypes`` contains a type other than "float", "Int64" or "str"
    """
    numerical_vartypes = {key: value for key, value in vartypes.items() if value in ["float", "Int64"]}
    str_var_types = {key: value for key, value in vartypes.items() if value == "str"}

    # Make sure we haven't missed any keys
    if len(numerical_vartypes) + len(str_var_types) != len(vartypes):
        raise ValueError("Not all vartypes have been accounted for")

    # Drop some keys that aren't important
    for k in CATETORICALS_TO_IGNORE:
        str_var_types.pop(k, None)

    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
    # Sample without replacement so that no directory (and therefore no property) is scored twice, and never
    # ask for more directories than exist
    directory_sample = sample(directories, k=min(N_DIRECTORIES, len(directories)))

    numeric_keys = list(numerical_vartypes)
    categorical_keys = list(str_var_types)

    results = []
    for directory in tqdm(directory_sample):
        results.extend(_evaluate_directory(directory, numeric_keys, categorical_keys))

    if not results:
        logger.warning("No EPC records were evaluated; nothing to summarise")
        return

    # Get aggregate performance figures
    results_df = pd.DataFrame(results)
    results_df["tenure"] = results_df["tenure"].replace(_TENURE_LABEL_REMAP)

    avg_numeric_success = results_df["numeric_success"].median()
    avg_categorical_success = results_df["categorical_success"].median()

    # Previously recorded figures (numeric median, then categorical median):
    # With 20 nearest homes
    # 0.7718100840549558
    # 0.5116279069767442
    # 100 nearest homes
    # 0.7859617377809409
    # 0.5348837209302325
    # Fixed sample, sqrt weights

    # Group by tenure
    by_tenure = results_df.groupby("tenure").agg(
        {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}
    )

    # Previously recorded figures by tenure. These were produced when only "Rented (social)" was folded into
    # "rental (social)", which is why the private / owner-occupied spellings appear as separate rows.
    # With 20 nearest homes
    #                                                     numeric_success  categorical_success   uprn
    # tenure
    # NO DATA!                                                   0.847840             0.581395    278
    # Not defined - use in the case of a new dwelling...         0.930282             0.651163    617
    # Owner-occupied                                             0.770330             0.511628   2588
    # Rented (private)                                           0.791885             0.558140   1232
    # owner-occupied                                             0.741088             0.488372  10912
    # rental (private)                                           0.749064             0.488372   3252
    # rental (social)                                            0.822109             0.581395   3878
    # unknown                                                    0.895840             0.627907   1820
    # 100 nearest homes
    # tenure
    # NO DATA!                                                   0.899566             0.604651    233
    # Not defined - use in the case of a new dwelling...         0.927518             0.674419    608
    # Owner-occupied                                             0.777026             0.511628   3167
    # Rented (private)                                           0.805646             0.534884   1316
    # owner-occupied                                             0.762180             0.488372  10835
    # rental (private)                                           0.760503             0.511628   3181
    # rental (social)                                            0.830057             0.604651   3705
    # unknown                                                    0.899948             0.627907   1571

    # By property type - we also want to see how many properties we have for each property type
    by_property_type = results_df.groupby("property_type").agg(
        {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}
    )

    # By property_type & built form
    by_property_type_built_form = results_df.groupby(["property_type", "built_form"]).agg(
        {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}
    )

    # Widen the pandas display limits so the summary tables are rendered in full
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    logger.info("Median numeric success: %s", avg_numeric_success)
    logger.info("Median categorical success: %s", avg_categorical_success)
    logger.info("Success by tenure:\n%s", by_tenure)
    logger.info("Success by property type:\n%s", by_property_type)
    logger.info("Success by property type and built form:\n%s", by_property_type_built_form)