@staticmethod
def difference_data(df):
    """
    Replace paired starting/ending feature columns with a single differenced column (ending minus
    starting), which is useful for modelling the difference responses.

    A column ``X_STARTING`` (or plain ``X``) is paired with ``X_ENDING``. Numerical pairs become
    ``X_DIFF``. Non-numerical pairs are one-hot encoded with ``pd.get_dummies`` and every level ``v``
    seen in either column becomes ``X_v_DIFF`` with values in {-1, 0, 1}; a level that only occurs on
    one side is treated as all-zero on the other side.

    Columns listed in FIXED_FEATURES, FIXED_DESCRIPTON_MAPPED_FEATURES and the response / identifier
    columns named below are left untouched.

    :param df: Pandas dataframe holding the starting and ending versions of each feature
    :return: Pandas dataframe with each starting/ending pair replaced by its ``*_DIFF`` column(s);
        the differenced columns are appended after the untouched ones
    :raises ValueError: if a candidate column has no matching ``*_ENDING`` column
    """

    # Build the exclusion lookup once instead of concatenating the lists for every column
    excluded = set(FIXED_FEATURES) | set(FIXED_DESCRIPTON_MAPPED_FEATURES) | {
        "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
        "CARBON_STARTING", "UPRN", "CONSTITUENCY",
    }

    # Keep the dataframe's own column order so the output layout is reproducible between runs
    columns = [col for col in df.columns if col not in excluded]
    column_lookup = set(columns)

    non_numerical_columns = [
        col for col in df.select_dtypes(exclude=['number']).columns.tolist() if col in column_lookup
    ]
    non_numerical_lookup = set(non_numerical_columns)

    # Record the levels before encoding; afterwards the original categorical columns no longer exist
    levels = {col: df[col].unique().tolist() for col in non_numerical_columns}

    df = pd.get_dummies(df, columns=non_numerical_columns)

    # We make sure there is a starting and ending version of the column
    diff_columns = []
    no_diff_columns = []  # Store for debugging
    for col in columns:
        if "_ENDING" in col:
            # Don't keep the endings, they are reached through their starting partner
            continue

        if col.replace("_STARTING", "") + "_ENDING" in column_lookup:
            diff_columns.append(col)
        else:
            no_diff_columns.append(col)

    if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
        raise ValueError("Something went wrong, potentially missed a differencing colunn")

    # Do the differencing
    cols_to_append = {}
    cols_to_drop = []
    for starting_col in diff_columns:

        base_col = starting_col.replace("_STARTING", "")
        # Use exactly the name that the pairing check above verified to exist
        ending_col = base_col + "_ENDING"

        if starting_col not in non_numerical_lookup:
            cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
            cols_to_drop.extend([starting_col, ending_col])
            continue

        # dict.fromkeys de-duplicates while keeping first-seen order (a set would shuffle the output)
        level_values = list(dict.fromkeys(levels[starting_col] + levels[ending_col]))

        for level in level_values:
            starting_level_col = f"{starting_col}_{level}"
            ending_level_col = f"{ending_col}_{level}"

            has_starting = starting_level_col in df.columns
            has_ending = ending_level_col in df.columns

            if not (has_starting or has_ending):
                # No indicator column on either side (e.g. a NaN level, which get_dummies does not
                # encode by default), so there is nothing to difference
                continue

            # Always cast the indicators to int: depending on the pandas version get_dummies returns
            # bool or uint8, and subtracting either directly is wrong (uint8 wraps around below zero).
            # A level missing on one side contributes 0 for that side.
            ending_values = df[ending_level_col].astype(int) if has_ending else 0
            starting_values = df[starting_level_col].astype(int) if has_starting else 0

            cols_to_append[f"{base_col}_{level}_DIFF"] = ending_values - starting_values

            if has_starting:
                cols_to_drop.append(starting_level_col)
            if has_ending:
                cols_to_drop.append(ending_level_col)

    # Drop every consumed column in one pass rather than copying the frame once per feature
    df = df.drop(columns=cols_to_drop)

    cols_to_append = pd.DataFrame(cols_to_append)
    df = pd.concat([df, cols_to_append], axis=1)

    return df