From e95b6e336913e60cc08c2e31d1b33db1a992b54e Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Fri, 11 Aug 2023 18:48:39 +0000
Subject: [PATCH] removed for loop and if condition

---
 model_data/simulation_system/app.py | 97 ++++++++++++++---------------
 1 file changed, 48 insertions(+), 49 deletions(-)

diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index c11e70eb..688d9cce 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -70,7 +70,6 @@ LATEST_FIELD = [
     "NUMBER_HABITABLE_ROOMS",
     "NUMBER_HEATED_ROOMS",
     "FIXED_LIGHTING_OUTLETS_COUNT",
-    "CONSTRUCTION_AGE_BAND",
     "FLOOR_LEVEL",
     "CONSTRUCTION_AGE_BAND",  # This is a field we're probably want to use verisk data for
 ]
@@ -232,11 +231,10 @@ class DataProcessor:
 
         # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
 
-
-        self.data = self.data[~pd.isnull(self.data["UPRN"])] \
-            [self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] \
-            [self.data["TRANSACTION_TYPE"] != "new dwelling"] \
-            [~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+        self.data = self.data[~pd.isnull(self.data["UPRN"])]
+        self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
+        self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
 
     
     def clean_multi_glaze_proportion(self) -> None:
@@ -270,73 +268,74 @@ def app():
         for uprn, property_data in df.groupby("UPRN", observed=True):
 
             # Fixed features - these are property attributes that shouldn't change over time
-
-            
-            ignore_epc = False
             fixed_data = {}
-            for field in FIXED_FEATURES:
-                vals = property_data[field].dropna().unique()
-                # Remove invalid values
-                vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
 
-                if field == "FLOOR_LEVEL":
-                    vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
+            # Map all anomaly values to None
+            data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
+            
+            # Replace every anomaly value with None using the map above, i.e. remove invalid values
+            modified_property_data = property_data.replace(data_anomaly_map)
+            modified_property_data = modified_property_data.replace(np.NAN, None)
 
-                if field == "BUILT_FORM":
-                    vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals})
+            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
+            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
+                continue
 
-                if field in AVERAGE_FIXED_FEATURES:
+            mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
+            
+            # Remap FLOOR_LEVEL and BUILT_FORM in place so later reads of these columns see the remapped values
+            modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
+            modified_property_data['BUILT_FORM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
 
-                    if len(vals) > 1:
-                        # Check the values are too far apart
-                        if abs(vals[0] - vals[1]) / vals[0] > 0.1:
-                            # Take the more recent value since it's likely to be more accurate
-                            vals = [vals[-1]]
+            latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
 
-                    if vals:
-                        field_value = np.mean(vals)
-                    else:
-                        # Clean using averages
+            # Taking just the last row, which is the percentage change from the latest to previous one only
+            # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
 
-                        avgs = iterative_filtering(cleaning_averages, property_data)
-                        # TODO: Should probably do a mean/median?
-                        field_value = avgs[field].iloc[0]
 
-                        if pd.isnull(field_value):
-                            # Just the use the general averages
-                            field_value = general_averages[
-                                (general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
-                                (general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0])
-                                ][field].iloc[0]
 
-                elif field in LATEST_FIELD:
-                    field_value = vals[-1] if vals else None
+            for field in AVERAGE_FIXED_FEATURES:
+                vals =  list(modified_property_data[field].dropna().unique())
+                if len(vals) > 1:
+                    # Check the values are too far apart
+                    if abs(vals[0] - vals[1]) / vals[0] > 0.1:
+                        # Take the more recent value since it's likely to be more accurate
+                        vals = [vals[-1]]
+
+                if vals:
+                    field_value = np.mean(vals)
                 else:
-                    if len(vals) > 1:
-                        if field in MANDATORY_FIXED_FEATURES:
-                            ignore_epc = True
-                        else:
-                            raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
+                    # Clean using averages
 
-                    field_value = vals[0] if vals else None
+                    avgs = iterative_filtering(cleaning_averages, modified_property_data)
+                    # TODO: Should probably do a mean/median?
+                    field_value = avgs[field].iloc[0]
 
+                    if pd.isnull(field_value):
+                        # Just use the general averages
+                        field_value = general_averages[
+                            (general_averages["PROPERTY_TYPE"] == modified_property_data["PROPERTY_TYPE"].iloc[0]) &
+                            (general_averages["BUILT_FORM"] == modified_property_data["BUILT_FORM"].iloc[0])
+                            ][field].iloc[0]
+                        
                 fixed_data[field] = field_value
 
-            if ignore_epc:
-                continue
+            # Combine all fields together
+            fixed_data.update(mandatory_field_data)
+            fixed_data.update(latest_field_data)
 
             # We include the lodgement date here as we probably need to factor time into the
             # model, since EPC standards and rigour have changed over time
-            variable_data = property_data[
+            variable_data = modified_property_data[
                 COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
                 ]
 
             # Note: we look at changes between subsequent EPCS, however we could look at other permutations
             # e.g. first vs second, second vs third and also first vs third
             property_model_data = []
-            for idx in range(0, property_data.shape[0] - 1):
+            for idx in range(0, modified_property_data.shape[0] - 1):
 
-                if idx >= property_data.shape[0] - 1:
+                if idx >= modified_property_data.shape[0] - 1:
                     break
 
                 starting_record = variable_data.iloc[idx]