def fill_na_fields(self, columns_to_fill: List = None):
    """
    Fill gaps in per-property fields from the other EPC records of the same UPRN.

    Rows of ``self.data`` are grouped by ``UPRN``. Within each group, every column
    in ``columns_to_fill`` is back filled first and then forward filled, following
    the current row order of ``self.data``. A column that is missing on every row
    of a group stays missing. ``self.data`` is updated in place and nothing is
    returned.

    :param columns_to_fill: names of the columns to fill. ``None`` (the default)
        means ``COLUMNS_TO_MERGE_ON``.
    """
    # Resolve the default at call time so the shared settings list is never
    # bound as a mutable default argument.
    if columns_to_fill is None:
        columns_to_fill = COLUMNS_TO_MERGE_ON

    columns = list(columns_to_fill)
    if not columns:
        # Nothing to fill; leave the frame untouched.
        return

    uprn = self.data["UPRN"]

    # GroupBy.bfill / GroupBy.ffill hand back rows on the original index, so the
    # result can be assigned straight back. This avoids a Python-level apply per
    # UPRN and the reset_index()/set_index('level_1') round trip, which only
    # worked when the frame's index was unnamed.
    back_filled = self.data[columns].groupby(uprn).bfill()
    self.data[columns] = back_filled.groupby(uprn).ffill()


def remap_columns(self):
    """
    Blank out anomalous values and remap the floor-level and built-form fields.

    Works on a new frame that then replaces ``self.data``:

    1. every value listed in ``BaseUtility.DATA_ANOMALY_MATCHES`` becomes ``None``;
    2. NaN becomes ``None``;
    3. ``FLOOR_LEVEL`` is passed through ``FLOOR_LEVEL_MAP`` and ``BUILT_FORM``
       through ``BUILT_FORM_REMAP``.
    """
    # dict.fromkeys maps every anomaly marker to None.
    anomaly_to_none = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)

    # replace() returns a new frame, so self.data is not modified until the
    # final assignment below.
    data = self.data.replace(anomaly_to_none)
    # np.nan is the canonical lower-case spelling of the NaN constant.
    data = data.replace(np.nan, None)

    # Remap certain columns
    data['FLOOR_LEVEL'] = data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
    # NOTE(review): the previous code stored this in a misspelled 'BUILT_FROM'
    # column, leaving 'BUILT_FORM' un-remapped. Confirm nothing downstream reads
    # the misspelled name.
    data['BUILT_FORM'] = data['BUILT_FORM'].replace(BUILT_FORM_REMAP)

    self.data = data
PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic-certificates/domestic-E09000021-Kingston-upon-Thames') for directory in tqdm(directories): filepath = directory / "certificates.csv" @@ -46,42 +49,21 @@ def app(): if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1: continue - # Map all anomaly values to None - data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES))) - - # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values - modified_property_data = property_data.replace(data_anomaly_map) - modified_property_data = modified_property_data.replace(np.NAN, None) - - # Remap certain columns - modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) - modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP) - # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS - latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict() - mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict() + latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict() + mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict() # Taking just the last row, which is the percentage change from the latest to previous one only - # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1 + # property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1 - # We can replace any NA values for Average fixed features - # We have columns that we want to merge on, but some of these columns are all NA values - # So we determine which columns to merge on, and get the equivalent grouping in the averages - columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", - 
"NUMBER_HEATED_ROOMS"] - - if modified_property_data[columns_to_merge_on].isna().values.any(): - # If there are any NA value, back fill first (i.e most recent), then forward fill if needed - modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill') - # Extract the columns that are not all None - na_columns = modified_property_data[columns_to_merge_on].isna().all() - columns_to_merge_on = na_columns.index[~na_columns].to_list() + na_columns = property_data[COLUMNS_TO_MERGE_ON].isna().all() + cleaned_columns_to_merge_on = na_columns.index[~na_columns].to_list() # Get the corresponding groupby and merge, and fill in NA values - cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean() + cleaning_averages_to_merge = cleaning_averages.groupby(cleaned_columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean() - modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE']) + modified_property_data = pd.merge(property_data, cleaning_averages_to_merge, on=cleaned_columns_to_merge_on, suffixes=['', '_AVERAGE']) modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE']) modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE']) modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) @@ -95,8 +77,10 @@ def app(): if abs(vals[0] - vals[1]) / vals[0] > 0.1: # Take the more recent value since it's likely to be more accurate vals = [vals[-1]] - + if len(vals) == 0: + wrong_var + fixed_data[field] = np.mean(vals) #Combine all fields together diff --git a/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet 
b/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet new file mode 100644 index 00000000..ac5249ce Binary files /dev/null and b/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet differ diff --git a/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet b/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet new file mode 100644 index 00000000..e7b2eb4a Binary files /dev/null and b/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet differ diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index cde310a3..da2c6f4a 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -99,18 +99,30 @@ def training(train_filepath: str, test_filepath: str) -> None: # logger.info('Split data into train and validation') logger.info('Build Model') - data = TabularDataset(data=train_df) + + data = TabularDataset(data=train_filepath) + data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE']) + TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT'] + # top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))] + + data = data[['RDSAP_CHANGE'] + top_features.to_list()] + # data = TabularDataset(data=train_df) # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float) - subsample_size = round(len(data)/4) + subsample_size = round(len(data)/20) data = data.sample(subsample_size, random_state=RANDOM_SEED) + # Add custom metric class MAPE + # Have a look at temporal features + target_column = 'RDSAP_CHANGE' predictor_RDSAP = TabularPredictor( label=target_column, path="agModels-predictRDSAP", problem_type="regression", eval_metric='mean_absolute_error' - ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN']) + 
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN']) + + logger.info('Evaluate matrics')