From eb216e55d39817a6d7bdd6c582c6da6826050ac9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 16:45:37 +0000 Subject: [PATCH] Handling missing dates in SearchEpc class --- backend/SearchEpc.py | 15 ++++++++++----- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 4a3f371a..3d2df9fb 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -30,7 +30,7 @@ vartypes = { 'environment-impact-potential': "Int64", 'glazed-type': 'str', 'heating-cost-current': 'float', - 'address3': 'str', + # 'address3': 'str', 'mainheatcont-description': 'str', 'sheating-energy-eff': 'str', 'property-type': 'str', @@ -40,7 +40,7 @@ vartypes = { 'mechanical-ventilation': 'str', 'hot-water-cost-current': 'str', 'county': 'str', - 'postcode': 'str', + # 'postcode': 'str', 'solar-water-heating-flag': 'str', 'constituency': 'str', 'co2-emissions-potential': 'float', @@ -55,7 +55,7 @@ vartypes = { # 'inspection-date': str, 'mains-gas-flag': 'str', 'co2-emiss-curr-per-floor-area': 'float', - 'address1': 'str', + # 'address1': 'str', 'heat-loss-corridor': 'str', 'flat-storey-count': "Int64", 'constituency-label': 'str', @@ -67,7 +67,7 @@ vartypes = { 'roof-description': 'str', 'floor-energy-eff': 'str', 'number-habitable-rooms': 'float', - 'address2': 'str', + # 'address2': 'str', 'hot-water-env-eff': 'str', 'posttown': 'str', 'mainheatc-energy-eff': 'str', @@ -98,7 +98,7 @@ vartypes = { # 'lodgement-date', 'extension-count': "Int64", 'mainheatc-env-eff': 'str', - 'lmk-key': 'str', + # 'lmk-key': 'str', 'wind-turbine-count': "Int64", 'tenure': 'str', 'floor-level': 'str', @@ -575,6 +575,11 @@ class SearchEpc: property_type=property_type ) + # If we have missing lodgment date, we fill it with inspection-date + epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["inspection-date"]) + # If we still have missing dates, we 
set it to the mean of the non-NA dates + epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["lodgement-datetime"].mean()) + # For each attribute, we need to determine the datatype and use an appropriate method # to estimate. estimated_epc = {} diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 2fb26e73..a8f0bfa9 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1135,6 +1135,7 @@ def get_epc_data( scoring_data = [] nodata = [] failed_model_rows = [] + # Failed at index 13691 for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): if property_meta["matching_postcode"] is None: