mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
finished data differencing poc
This commit is contained in:
parent
00328d461b
commit
7956a4adc4
2 changed files with 110 additions and 83 deletions
|
|
@ -7,8 +7,6 @@ from model_data.simulation_system.core.Settings import (
|
|||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE,
|
||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
|
|
@ -17,8 +15,10 @@ from model_data.simulation_system.core.Settings import (
|
|||
COLUMNTYPES,
|
||||
RDSAP_RESPONSE,
|
||||
MAX_SAP_SCORE,
|
||||
fill_na_map
|
||||
fill_na_map,
|
||||
FIXED_DESCRIPTON_MAPPED_FEATURES
|
||||
)
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
|
|
@ -502,3 +502,108 @@ class DataProcessor:
|
|||
:return: Pandas dataframe containing the columns defined in FIXED_FEATURES
|
||||
"""
|
||||
return self.data[FIXED_FEATURES]
|
||||
|
||||
@staticmethod
def difference_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace every starting/ending feature pair in the dataframe with a single differenced column.

    Columns listed in FIXED_FEATURES and FIXED_DESCRIPTON_MAPPED_FEATURES, the response columns and the
    identifier columns are left untouched. Every remaining column must come as a pair: a starting column
    (named ``<base>_STARTING`` or just ``<base>``) and an ending column ``<base>_ENDING``.

    - Numerical pairs become ``<base>_DIFF`` = ending - starting.
    - Non-numerical pairs are one-hot encoded and become one ``<base>_<level>_DIFF`` column per level,
      holding 1 where the level was gained, -1 where it was lost and 0 where it is unchanged.

    The original starting/ending columns (and their one-hot columns) are dropped from the result.

    :param df: Pandas dataframe containing the starting and ending versions of each feature
    :return: Pandas dataframe where the paired columns are replaced by their ``_DIFF`` columns
    :raises Exception: if a column that should be differenced has no matching ``_ENDING`` column
    """

    # Build the exclusion set once rather than concatenating the lists for every column
    excluded = set(FIXED_FEATURES) | set(FIXED_DESCRIPTON_MAPPED_FEATURES) | {
        "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
        "CARBON_STARTING", "UPRN", "CONSTITUENCY",
    }

    # Keep the dataframe's own column order so the output column order is reproducible between runs
    columns = list(dict.fromkeys(col for col in df.columns if col not in excluded))
    column_lookup = set(columns)

    non_numerical_columns = [
        col for col in df.select_dtypes(exclude=['number']).columns.tolist() if col in column_lookup
    ]
    non_numerical_lookup = set(non_numerical_columns)

    # Record the levels before encoding, they tell us which one-hot columns to look for afterwards
    levels = {col: df[col].unique().tolist() for col in non_numerical_columns}

    df = pd.get_dummies(df, columns=non_numerical_columns)

    # We make sure there is a starting and ending version of the column
    diff_columns = []
    no_diff_columns = []  # Store for debugging
    for col in columns:
        if "_ENDING" in col:
            # Don't keep the endings
            continue

        # We have a starting column so check if we have an ending
        if col.replace("_STARTING", "") + "_ENDING" in column_lookup:
            diff_columns.append(col)
        else:
            no_diff_columns.append(col)

    if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
        raise Exception("Something went wrong, potentially missed a differencing colunn")

    # Do the differencing
    cols_to_append = {}
    cols_to_drop = []
    for starting_col in diff_columns:

        base_col = starting_col.replace("_STARTING", "")

        if "_STARTING" in starting_col:
            ending_col = starting_col.replace("_STARTING", "_ENDING")
        else:
            ending_col = starting_col + "_ENDING"

        if starting_col not in non_numerical_lookup:
            cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
            cols_to_drop.extend([starting_col, ending_col])
            continue

        # Union of the levels seen on either side, in first-seen order
        level_values = list(dict.fromkeys(levels[starting_col] + levels[ending_col]))

        for level in level_values:
            starting_level_col = "_".join([starting_col, str(level)])
            ending_level_col = "_".join([ending_col, str(level)])

            has_starting = starting_level_col in df.columns
            has_ending = ending_level_col in df.columns

            if not has_starting and not has_ending:
                # get_dummies creates no column for missing values (NaN/None), so there is nothing to difference
                continue

            # Always cast the indicator columns to int: bool columns cannot be subtracted and unsigned
            # integer indicators (older pandas) would wrap around to 255 instead of giving -1.
            # A level that only exists on one side counts as all zeros on the other side.
            ending_values = df[ending_level_col].astype(int) if has_ending else 0
            starting_values = df[starting_level_col].astype(int) if has_starting else 0

            cols_to_append[f"{base_col}_{level}_DIFF"] = ending_values - starting_values

            if has_starting:
                cols_to_drop.append(starting_level_col)
            if has_ending:
                cols_to_drop.append(ending_level_col)

    # Drop the source columns once, instead of copying the dataframe on every iteration
    df = df.drop(columns=cols_to_drop)

    differenced = pd.DataFrame(cols_to_append)
    df = pd.concat([df, differenced], axis=1)

    return df
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_
|
|||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||
from recommendations.recommendation_utils import (
|
||||
get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter_2_rooms, estimate_perimeter,
|
||||
extract_insulation_thickness, get_wall_type
|
||||
get_wall_type
|
||||
)
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
|
@ -539,85 +539,7 @@ def app():
|
|||
if pd.isnull(data_by_urpn_df).sum().sum():
|
||||
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
||||
|
||||
# TODO: Move to DataProcessor
|
||||
def difference_data(df):
|
||||
|
||||
from model_data.simulation_system.core.Settings import FIXED_FEATURES, FIXED_DESCRIPTON_MAPPED_FEATURES
|
||||
|
||||
columns = {
|
||||
x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
|
||||
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
|
||||
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
|
||||
]
|
||||
}
|
||||
|
||||
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
|
||||
non_numerical_columns = [col for col in non_numerical_columns if col in columns]
|
||||
levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
|
||||
|
||||
df = pd.get_dummies(df, columns=non_numerical_columns)
|
||||
|
||||
# We make sure there is a starting and ending version of the column
|
||||
diff_columns = []
|
||||
no_diff_columns = [] # Store for debugging
|
||||
for col in columns:
|
||||
if "_ENDING" in col:
|
||||
# Don't keep the endings
|
||||
continue
|
||||
else:
|
||||
# We have a starting column so check if we have an ending
|
||||
if col.replace("_STARTING", "") + "_ENDING" in columns:
|
||||
diff_columns.append(col)
|
||||
else:
|
||||
no_diff_columns.append(col)
|
||||
|
||||
if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
|
||||
raise Exception("Something went wrong, potentially missed a differencing colunn")
|
||||
|
||||
datatypes = df.dtypes
|
||||
|
||||
# Do the differencing
|
||||
cols_to_append = {}
|
||||
for starting_col in diff_columns:
|
||||
|
||||
base_col = starting_col.replace("_STARTING", "")
|
||||
|
||||
if "_STARTING" in starting_col:
|
||||
ending_col = starting_col.replace("_STARTING", "_ENDING")
|
||||
else:
|
||||
ending_col = starting_col + "_ENDING"
|
||||
|
||||
if starting_col not in non_numerical_columns:
|
||||
cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
|
||||
df = df.drop(columns=[starting_col, ending_col])
|
||||
continue
|
||||
|
||||
level_values = list(set(levels[starting_col] + levels[ending_col]))
|
||||
|
||||
level_cols = []
|
||||
for level in level_values:
|
||||
starting_level_col = "_".join([starting_col, str(level)])
|
||||
ending_level_col = "_".join([ending_col, str(level)])
|
||||
|
||||
col_type = datatypes[starting_level_col].name
|
||||
|
||||
if starting_level_col not in df.columns:
|
||||
df[starting_level_col] = 0
|
||||
|
||||
if ending_level_col not in df.columns:
|
||||
df[ending_level_col] = 0
|
||||
|
||||
if col_type == "bool":
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = (
|
||||
df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
|
||||
)
|
||||
else:
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
|
||||
|
||||
level_cols.extend([starting_level_col, ending_level_col])
|
||||
|
||||
# Drop the columns
|
||||
df = df.drop(columns=level_cols)
|
||||
data_by_urpn_df = DataProcessor.difference_data(data_by_urpn_df)
|
||||
|
||||
dataset.append(data_by_urpn_df)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue