From e95b6e336913e60cc08c2e31d1b33db1a992b54e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 11 Aug 2023 18:48:39 +0000 Subject: [PATCH] removed for loop and if condition --- model_data/simulation_system/app.py | 97 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index c11e70eb..688d9cce 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -70,7 +70,6 @@ LATEST_FIELD = [ "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", - "CONSTRUCTION_AGE_BAND", "FLOOR_LEVEL", "CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for ] @@ -232,11 +231,10 @@ class DataProcessor: # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous - - self.data = self.data[~pd.isnull(self.data["UPRN"])] \ - [self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] \ - [self.data["TRANSACTION_TYPE"] != "new dwelling"] \ - [~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] + self.data = self.data[~pd.isnull(self.data["UPRN"])] + self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] + self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"] + self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] def clean_multi_glaze_proportion(self) -> None: @@ -270,73 +268,74 @@ def app(): for uprn, property_data in df.groupby("UPRN", observed=True): # Fixed features - these are property attributes that shouldn't change over time - - - ignore_epc = False fixed_data = {} - for field in FIXED_FEATURES: - vals = property_data[field].dropna().unique() - # Remove invalid values - vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES] - if field == "FLOOR_LEVEL": - vals = list({FLOOR_LEVEL_MAP[v] for v in vals}) + # Map all anomaly values to None + data_anomaly_map = 
dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES))) + + # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values + modified_property_data = property_data.replace(data_anomaly_map) + modified_property_data = modified_property_data.replace(np.nan, None) - if field == "BUILT_FORM": - vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals}) + # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row + if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1: + continue - if field in AVERAGE_FIXED_FEATURES: + mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict() + + # Remap certain columns + modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) + modified_property_data['BUILT_FORM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP) - if len(vals) > 1: - # Check the values are too far apart - if abs(vals[0] - vals[1]) / vals[0] > 0.1: - # Take the more recent value since it's likely to be more accurate - vals = [vals[-1]] + latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict() - if vals: - field_value = np.mean(vals) - else: - # Clean using averages + # Taking just the last row, which is the percentage change from the latest to previous one only + # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1 - avgs = iterative_filtering(cleaning_averages, property_data) - # TODO: Should probably do a mean/median? 
- field_value = avgs[field].iloc[0] - if pd.isnull(field_value): - # Just the use the general averages - field_value = general_averages[ - (general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) & - (general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) - ][field].iloc[0] - elif field in LATEST_FIELD: - field_value = vals[-1] if vals else None + for field in AVERAGE_FIXED_FEATURES: + vals = list(modified_property_data[field].dropna().unique()) + if len(vals) > 1: + # Check the values are too far apart + if abs(vals[0] - vals[1]) / vals[0] > 0.1: + # Take the more recent value since it's likely to be more accurate + vals = [vals[-1]] + + if vals: + field_value = np.mean(vals) else: - if len(vals) > 1: - if field in MANDATORY_FIXED_FEATURES: - ignore_epc = True - else: - raise ValueError("Fixed feature {} has more than one value - fix me".format(field)) + # Clean using averages - field_value = vals[0] if vals else None + avgs = iterative_filtering(cleaning_averages, modified_property_data) + # TODO: Should probably do a mean/median? + field_value = avgs[field].iloc[0] + if pd.isnull(field_value): + # Just use the general averages + field_value = general_averages[ + (general_averages["PROPERTY_TYPE"] == modified_property_data["PROPERTY_TYPE"].iloc[0]) & + (general_averages["BUILT_FORM"] == modified_property_data["BUILT_FORM"].iloc[0]) + ][field].iloc[0] + fixed_data[field] = field_value - if ignore_epc: - continue + # Combine all fields together + fixed_data.update(mandatory_field_data) + fixed_data.update(latest_field_data) # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = property_data[ + variable_data = modified_property_data[ COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE] ] # Note: we look at changes between subsequent EPCS, however we could look at other permutations # e.g. 
first vs second, second vs third and also first vs third property_model_data = [] - for idx in range(0, property_data.shape[0] - 1): + for idx in range(0, modified_property_data.shape[0] - 1): - if idx >= property_data.shape[0] - 1: + if idx >= modified_property_data.shape[0] - 1: break starting_record = variable_data.iloc[idx]