From de01b8d73c6672d39984056cb84c17b2dd9b0861 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 14 Aug 2023 20:24:56 +0000 Subject: [PATCH] Fixed bug with na values --- model_data/simulation_system/DataProcessor.py | 7 +++++-- model_data/simulation_system/app.py | 19 +++++++------------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/DataProcessor.py index 4b2202e8..2aa0fabe 100644 --- a/model_data/simulation_system/DataProcessor.py +++ b/model_data/simulation_system/DataProcessor.py @@ -27,6 +27,8 @@ class DataProcessor: """ self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory']) self.confine_data() + + # TODO: Clean number of heated rooms and habitable rooms self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings']) self.clean_multi_glaze_proportion() self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count']) @@ -39,10 +41,11 @@ class DataProcessor: # Define a custom function to calculate the median, excluding missing values def median_without_missing(group): return group[AVERAGE_FIXED_FEATURES].median(skipna=True) - + cleaning_averages = self.data.groupby( ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], - observed=True + observed=True, + dropna=False ).apply(median_without_missing).reset_index() general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply( diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index e1ab4c97..1037da14 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -42,8 +42,8 @@ def app(): # Fixed features - these are property attributes that shouldn't change over time fixed_data = {} - # If a property has changed building type, we can ignore the epc rating i.e. 
this should be 1 unique row - if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1: + # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row + if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1: continue # Map all anomaly values to None @@ -70,7 +70,7 @@ def app(): columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"] - if any(modified_property_data[columns_to_merge_on].isna()): + if modified_property_data[columns_to_merge_on].isna().values.any(): # If there are any NA value, back fill first (i.e most recent), then forward fill if needed modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill') @@ -80,12 +80,14 @@ def app(): # Get the corresponding groupby and merge, and fill in NA values cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean() + modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE']) modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE']) modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE']) modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) for field in AVERAGE_FIXED_FEATURES: + vals = list(modified_property_data[field].dropna().unique()) if len(vals) > 1: # Check the values are too far apart @@ -93,11 +95,9 @@ def app(): if abs(vals[0] - vals[1]) / vals[0] > 0.1: # Take the more recent value since it's likely to be more accurate vals = [vals[-1]] + - if vals: - field_value = np.mean(vals) - - fixed_data[field] = field_value + fixed_data[field] = 
np.mean(vals) #Combine all fields together fixed_data.update(mandatory_field_data) @@ -122,11 +122,6 @@ def app(): rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE] heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE] - # TODO: Should this be <= 0? - if rdsap_change == 0: - # Assumption: We aren't interested in records that exhibit no change - continue - # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and # floors, we may want to use the U-value. We may also want to handle the (assumed) tags # within descriptions