From de01b8d73c6672d39984056cb84c17b2dd9b0861 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Mon, 14 Aug 2023 20:24:56 +0000
Subject: [PATCH] Fixed bug with NA values

---
 model_data/simulation_system/DataProcessor.py |  7 +++++--
 model_data/simulation_system/app.py           | 19 +++++++------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/DataProcessor.py
index 4b2202e8..2aa0fabe 100644
--- a/model_data/simulation_system/DataProcessor.py
+++ b/model_data/simulation_system/DataProcessor.py
@@ -27,6 +27,8 @@ class DataProcessor:
         """
         self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
         self.confine_data()
+
+        # TODO: Clean number of heated rooms and habitable rooms
         self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
         self.clean_multi_glaze_proportion()
         self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
@@ -39,10 +41,11 @@ class DataProcessor:
         # Define a custom function to calculate the median, excluding missing values
         def median_without_missing(group):
             return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
-
+    
         cleaning_averages = self.data.groupby(
             ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
-            observed=True
+            observed=True,
+            dropna=False
         ).apply(median_without_missing).reset_index()
 
         general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index e1ab4c97..1037da14 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -42,8 +42,8 @@ def app():
             # Fixed features - these are property attributes that shouldn't change over time
             fixed_data = {}
 
-            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
-            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
+            # If a property has changed building type, we can ignore the EPC rating, i.e. this should be 1 unique row
+            if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
                 continue
 
             # Map all anomaly values to None
@@ -70,7 +70,7 @@ def app():
             columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
                          "NUMBER_HEATED_ROOMS"]
             
-            if any(modified_property_data[columns_to_merge_on].isna()):
+            if modified_property_data[columns_to_merge_on].isna().values.any():
                 # If there are any NA value, back fill first (i.e most recent), then forward fill if needed
                 modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
             
@@ -80,12 +80,14 @@ def app():
 
             #  Get the corresponding groupby and merge, and fill in NA values
             cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
+            
             modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
             modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
             modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
             modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
 
             for field in AVERAGE_FIXED_FEATURES:
+
                 vals =  list(modified_property_data[field].dropna().unique())
                 if len(vals) > 1:
                     # Check the values are too far apart
@@ -93,11 +95,9 @@ def app():
                     if abs(vals[0] - vals[1]) / vals[0] > 0.1:
                         # Take the more recent value since it's likely to be more accurate
                         vals = [vals[-1]]
+         
 
-                if vals:
-                    field_value = np.mean(vals)
-                
-                fixed_data[field] = field_value
+                fixed_data[field] = np.mean(vals)
 
             #Combine all fields together
             fixed_data.update(mandatory_field_data)
@@ -122,11 +122,6 @@ def app():
                 rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
                 heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
 
-                # TODO: Should this be <= 0?
-                if rdsap_change == 0:
-                    # Assumption: We aren't interested in records that exhibit no change
-                    continue
-
                 # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
                 #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
                 #       within descriptions