From 00328d461b01d6acb7681f249431f3ea3d0c708d Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Thu, 21 Sep 2023 18:32:10 +0100
Subject: [PATCH] working on data differencing code

---
 .../simulation_system/core/DataProcessor.py   |  4 +
 model_data/simulation_system/core/Settings.py | 10 ++-
 .../generate_rdsap_change.py                  | 90 +++++++++++++++++++
 recommendations/recommendation_utils.py       | 17 +++-
 4 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py
index 878778de..bd7e32fe 100644
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -258,6 +258,10 @@ class DataProcessor:
         data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
         data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
 
+        convert_to_lower = ["TRANSACTION_TYPE"]
+        for col in convert_to_lower:
+            data[col] = data[col].str.lower()
+
         self.data = data
 
     def make_cleaning_averages(self) -> pd.DataFrame:
diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py
index 935ae940..8b5252c5 100644
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@@ -83,8 +83,6 @@ FIXED_FEATURES = [
     "CONSTITUENCY",
     "NUMBER_HEATED_ROOMS",
     "FIXED_LIGHTING_OUTLETS_COUNT",
-    "FLOOR_HEIGHT"
-    "TOTAL_FLOOR_AREA",
 ]
 
 COMPONENT_FEATURES = [
@@ -213,3 +211,11 @@ fill_na_map = {
     "EXTENSION_COUNT": 0,
     "NUMBER_OPEN_FIREPLACES": 0
 }
+
+# After the property descriptions have been re-remapped, we expect these features to be fixed
+FIXED_DESCRIPTON_MAPPED_FEATURES = [
+    'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
+    'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
+    'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
+    'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
+]
diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py
index f4af4d06..76f0042d 100644
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@@ -509,6 +509,8 @@ def app():
             pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
         ).dt.days
 
+        data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
+
         # We look for key building fabric features that have changed from one EPC to the next.
         # if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
         # remove this record, as it indicates that the quality of the EPC conducted in the first instance
@@ -537,6 +539,94 @@ def app():
         if pd.isnull(data_by_urpn_df).sum().sum():
             raise ValueError("Null values found in dataset after process_and_prune_desriptions")
 
+        # TODO: Move to DataProcessor
+        def difference_data(df):
+            """
+            Replace every starting/ending column pair in df with differenced column(s).
+
+            Numerical pairs become <base>_DIFF (ending minus starting). Non-numerical pairs are one-hot
+            encoded and become one <base>_<level>_DIFF column per level seen on either side. Fixed
+            features, targets and identifiers are passed through untouched.
+
+            :param df: dataframe containing the starting and ending columns to difference
+            :return: dataframe with each pair replaced by its *_DIFF column(s)
+            :raises ValueError: if a column has no ending counterpart, or a pair mixes numerical
+                and non-numerical data
+            """
+            from model_data.simulation_system.core.Settings import FIXED_FEATURES, FIXED_DESCRIPTON_MAPPED_FEATURES
+
+            excluded = set(
+                FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
+                    "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
+                    "CARBON_STARTING", "UPRN", "CONSTITUENCY",
+                ]
+            )
+            # A list (rather than a set) keeps the dataframe's column order, so the output is deterministic
+            columns = [x for x in df.columns if x not in excluded]
+            column_lookup = set(columns)
+
+            non_numerical_columns = [
+                col for col in df.select_dtypes(exclude=['number']).columns if col in column_lookup
+            ]
+            levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
+
+            df = pd.get_dummies(df, columns=non_numerical_columns)
+
+            # We make sure there is a starting and ending version of the column
+            diff_columns = []
+            no_diff_columns = []  # Reported in the error below
+            for col in columns:
+                if "_ENDING" in col:
+                    # Don't keep the endings, they are consumed together with their starting column
+                    continue
+                if col.replace("_STARTING", "") + "_ENDING" in column_lookup:
+                    diff_columns.append(col)
+                else:
+                    no_diff_columns.append(col)
+
+            if no_diff_columns:
+                raise ValueError(
+                    f"Something went wrong, potentially missed a differencing column: {no_diff_columns}"
+                )
+
+            # A level seen on only one side of a pair has no dummy column on the other side
+            zeros = pd.Series(0, index=df.index)
+
+            # Do the differencing
+            cols_to_append = {}
+            cols_to_drop = []
+            for starting_col in diff_columns:
+                base_col = starting_col.replace("_STARTING", "")
+                # Built the same way as the pairing check above so the two can never disagree
+                ending_col = base_col + "_ENDING"
+
+                if (starting_col in levels) != (ending_col in levels):
+                    raise ValueError(
+                        f"{starting_col} and {ending_col} must both be numerical or both be non-numerical"
+                    )
+
+                if starting_col not in levels:
+                    cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
+                    cols_to_drop.extend([starting_col, ending_col])
+                    continue
+
+                # dict.fromkeys de-duplicates the levels while keeping a stable order
+                for level in dict.fromkeys(levels[starting_col] + levels[ending_col]):
+                    starting_level_col = f"{starting_col}_{level}"
+                    ending_level_col = f"{ending_col}_{level}"
+
+                    # Cast to int: bool dummies cannot be subtracted and uint8 dummies would wrap around
+                    cols_to_append[f"{base_col}_{level}_DIFF"] = (
+                        df.get(ending_level_col, zeros).astype(int) - df.get(starting_level_col, zeros).astype(int)
+                    )
+                    cols_to_drop.extend(c for c in (starting_level_col, ending_level_col) if c in df.columns)
+
+            df = df.drop(columns=cols_to_drop)
+            if cols_to_append:
+                df = pd.concat([df, pd.DataFrame(cols_to_append)], axis=1)
+
+            return df
+
         dataset.append(data_by_urpn_df)
 
         cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py
index 400f75c9..c121443b 100644
--- a/recommendations/recommendation_utils.py
+++ b/recommendations/recommendation_utils.py
@@ -1,5 +1,8 @@
 import math
 from copy import deepcopy
+
+import pandas as pd
+
 from backend.Property import Property
 from statistics import mean
 from recommendations.rdsap_tables import (
@@ -211,6 +214,11 @@ def get_wall_u_value(clean_description, age_band, is_granite_or_whinstone, is_sa
 
     mapped_description = epc_wall_description_map[clean_description]
     mapped_value = wall_uvalues_df[wall_uvalues_df["Wall_type"] == mapped_description][age_band].values[0]
+
+    if pd.isnull(mapped_value) and "Park home" in mapped_description:
+        # We don't know enough in this case so we default to 0
+        return 0
+
     if mapped_value == "a":
         # The rdSap documentation indicateswe should use a formula to calculate the u-value
         return float(
@@ -231,7 +239,8 @@ def get_wall_u_value(clean_description, age_band, is_granite_or_whinstone, is_sa
         return min(potential_uvalue, formula_uvalue)
 
     if mapped_value == "s1.1.2":
-        return None
+        # We don't know enough in this case so we default to 0
+        return 0
 
     return float(mapped_value)
 
@@ -410,7 +419,11 @@ def get_floor_u_value(floor_type, area, perimeter, age_band, wall_type, insulati
     Rse = 0.04  # in m²K/W
     lambda_ins = 0.035  # thermal conductivity of floor insulation in W/m·K
 
-    wall_thickness = [x[age_band] for x in default_wall_thickness if x["type"] == wall_type][0] / 1000
+    wall_thickness = [x[age_band] for x in default_wall_thickness if x["type"] == wall_type][0]
+    if wall_thickness is None and wall_type == "park home":
+        # We don't know enough and likely won't make recommendations
+        return 0
+    wall_thickness = wall_thickness / 1000
 
     if insulation_thickness is None:
         insulation_lookup = s11[s11["Age_band"].str.contains(age_band) & s11["Floor_construction"] == floor_type]