From 2156f6b0764cf8f52cb592187b097e1f019afa4c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 2 Jan 2024 18:23:52 +0000 Subject: [PATCH] building out aggregation --- backend/SearchEpc.py | 2 +- etl/testing_data/estimate_epc.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 07a7de23..f93fc5f0 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -437,6 +437,7 @@ class SearchEpc: if not epc_data.empty: # Further processing of the EPC data + epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime']) epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1) epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1)) epc_data["numeric_house_number"] = epc_data["house_number"].apply( @@ -505,7 +506,6 @@ class SearchEpc: built_form=built_form, property_type=property_type ) - epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime']) # For each attribute, we need to determine the datatype and use an appropriate method # to estimate. diff --git a/etl/testing_data/estimate_epc.py b/etl/testing_data/estimate_epc.py index dd919000..c72df6af 100644 --- a/etl/testing_data/estimate_epc.py +++ b/etl/testing_data/estimate_epc.py @@ -113,3 +113,22 @@ def app(): "tenure": epc["tenure"], } ) + + # Get aggregate performance figures + results_df = pd.DataFrame(results) + + avg_numeric_succes = results_df["numeric_success"].median() + avg_categorical_sucess = results_df["categorical_success"].median() + + # Group by tenure + by_tenure = results_df.groupby("tenure").agg( + {"numeric_success": "median", "categorical_success": "median", "uprn": "count"} + ) + # By property type - we also want to see how many properties we have for each property type + by_property_type = results_df.groupby("property_type").agg( + {"numeric_success": "median", "categorical_success": "median", "uprn": "count"} + ) + # By property_type & built form + by_property_type_built_form = results_df.groupby(["property_type", "built_form"]).agg( + {"numeric_success": "median", "categorical_success": "median", "uprn": "count"} + )