From 81621eb1de1825ed80bc7469ad3827f48a8cb626 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 2 Jan 2024 18:05:29 +0000 Subject: [PATCH] working on epc error estimation method --- backend/SearchEpc.py | 19 +++++++++++++------ etl/testing_data/estimate_epc.py | 13 ++++++++++--- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 543fa6b8..6a0f3389 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -7,6 +7,7 @@ import pandas as pd import numpy as np from epc_api.client import EpcClient from backend.OrdnanceSurvey import OrdnanceSuveyClient +from BaseUtility import Definitions from utils.logger import setup_logger from typing import List from fuzzywuzzy import process @@ -441,10 +442,15 @@ class SearchEpc: epc_data["numeric_house_number"] = epc_data["house_number"].apply( lambda house_num: self.extract_numeric_housenumber_part(house_num) ) - epc_data["house_number_distance"] = abs( - epc_data["numeric_house_number"] - self.numeric_house_number - ) - epc_data["weight"] = 1 / epc_data["house_number_distance"] + + if self.numeric_house_number is None: + # If we don't have a house number, we treat all weights as equal + epc_data["weight"] = 1 + else: + epc_data["house_number_distance"] = abs( + epc_data["numeric_house_number"] - self.numeric_house_number + ) + epc_data["weight"] = 1 / epc_data["house_number_distance"] epc_built_form = self._estimate_str(key="built-form", estimation_data=epc_data) epc_property_type = self._estimate_str(key="property-type", estimation_data=epc_data) @@ -504,9 +510,10 @@ class SearchEpc: for key, vartype in vartypes.items(): epc_data[key] = np.where(pd.isnull(epc_data[key]), None, epc_data[key]) epc_data[key] = np.where(epc_data[key] == "", None, epc_data[key]) - epc_data[key] = epc_data[key].astype(vartype) - estimation_data = epc_data[[key, "weight", "lodgement-datetime"]] + estimation_data = epc_data[[key, "weight", "lodgement-datetime"]].copy() estimation_data = estimation_data[~pd.isnull(estimation_data[key])] + estimation_data = estimation_data[~estimation_data[key].isin(Definitions.DATA_ANOMALY_MATCHES)] + estimation_data[key] = estimation_data[key].astype(vartype) if estimation_data.shape[0] == 0: estimated_epc[key] = None diff --git a/etl/testing_data/estimate_epc.py b/etl/testing_data/estimate_epc.py index 8041e38d..dd919000 100644 --- a/etl/testing_data/estimate_epc.py +++ b/etl/testing_data/estimate_epc.py @@ -32,6 +32,9 @@ def check_numeric_performance(estimated_value, actual_value): if actual_value == 0 and estimated_value == 0: return 0 + if actual_value == 0 and estimated_value != 0: + return 1 + return abs(estimated_value - actual_value) / actual_value @@ -91,18 +94,22 @@ def app(): numeric_performance = {key: value for key, value in numeric_performance.items() if value is not None} # Get an average numeric_performance = sum(numeric_performance.values()) / len(numeric_performance) + numeric_success = 1 - numeric_performance # categorical performance categorical_performance = { key: 0 if estimated_epc[key] != epc[key] else 1 for key, value in str_var_types.items() } # Get an average - categorical_performance = sum(categorical_performance.values()) / len(categorical_performance) + categorical_success = sum(categorical_performance.values()) / len(categorical_performance) results.append( { "uprn": epc["uprn"], - "numeric_performance": numeric_performance, - "categorical_performance": categorical_performance + "numeric_success": numeric_success, + "categorical_success": categorical_success, + "property_type": epc["property-type"], + "built_form": epc["built-form"], + "tenure": epc["tenure"], } )