diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index f93fc5f0..d7e1ae58 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -516,7 +516,12 @@ class SearchEpc: estimation_data = epc_data[[key, "weight", "lodgement-datetime"]].copy() estimation_data = estimation_data[~pd.isnull(estimation_data[key])] estimation_data = estimation_data[~estimation_data[key].isin(Definitions.DATA_ANOMALY_MATCHES)] - estimation_data[key] = estimation_data[key].astype(vartype) + if vartype == "Int64": + # Some values are float-formatted strings (e.g. '1.0'); casting those straight to Int64 raises + # "invalid literal for int() with base 10: '1.0'", so cast to float first and then to Int64 + estimation_data[key] = estimation_data[key].astype(float).astype(vartype) + else: + estimation_data[key] = estimation_data[key].astype(vartype) if estimation_data.shape[0] == 0: estimated_epc[key] = None diff --git a/etl/testing_data/estimate_epc.py b/etl/testing_data/estimate_epc.py index c72df6af..2203402a 100644 --- a/etl/testing_data/estimate_epc.py +++ b/etl/testing_data/estimate_epc.py @@ -7,6 +7,7 @@ from tqdm import tqdm from dotenv import load_dotenv from utils.logger import setup_logger from backend.SearchEpc import SearchEpc, vartypes +from etl.epc.settings import BUILT_FORM_REMAP ENV_FILE = Path(__file__).parent / "backend" / ".env" @@ -20,6 +21,12 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") load_dotenv(ENV_FILE) +CATETORICALS_TO_IGNORE = [ + "postcode", "constituency", "local-authority", "built-form", "property-type", "address1", "constituency-label", + "building-reference-number", "address2", "posttown", "transaction-type", "lmk-key", "address3", + "local-authority-label", "county", +] + def check_numeric_performance(estimated_value, actual_value): # If we don't have anything to compare against, return None @@ -49,6 +56,10 @@ def app(): if len(numerical_vartypes) + len(str_var_types) != len(vartypes): raise ValueError("Not all vartypes have been accounted for") + # Drop the categorical keys we don't care about here (those listed in CATETORICALS_TO_IGNORE) + for k in 
CATETORICALS_TO_IGNORE: + str_var_types.pop(k, None) + directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] directory_sample = choices(directories, k=N_DIRECTORIES) @@ -80,11 +91,16 @@ def app(): searcher = SearchEpc(address1, postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="") searcher.uprn = epc["uprn"] + # Perform the same remapping for built-form as in the Property class for this test, in case we get (e.g.) + # Enclosed End-Terrace + built_form = BUILT_FORM_REMAP.get(epc["built-form"], epc["built-form"]) + estimated_epc = searcher.estimate_epc( - property_type=epc["property-type"], built_form=epc["built-form"], lmks_to_drop=lmks_to_drop + property_type=epc["property-type"], built_form=built_form, lmks_to_drop=lmks_to_drop ) # We now compare the difference between the estimated and original + # TODO: We can convert windows and lighting to numeric versions and estimate how close we are numeric_performance = { key: check_numeric_performance(estimated_epc[key], epc[key]) for key, value in numerical_vartypes.items()