diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py
index 8e2a7e25..4a361196 100644
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@@ -4,26 +4,46 @@ from Settings import (
     FLOOR_LEVEL_MAP,
     BUILT_FORM_REMAP,
     EARLIEST_EPC_DATE,
-    FULLY_GLAZED_DESCRIPTIONS
+    FULLY_GLAZED_DESCRIPTIONS,
+    FIXED_FEATURES,
+    LATEST_FIELD,
+    COMPONENT_FEATURES
 )
 from model_data.BaseUtility import BaseUtility
 from tqdm import tqdm
 import pandas as pd
 import numpy as np
 
+from autogluon.tabular import TabularDataset, TabularPredictor
+
+RANDOM_SEED = 0
+
 DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
 
 
-def main():
+FLOAT_COLUMNS = [
+    'NUMBER_OPEN_FIREPLACES',
+    'EXTENSION_COUNT',
+    'TOTAL_FLOOR_AREA',
+    'PHOTO_SUPPLY',
+    'FIXED_LIGHTING_OUTLETS_COUNT',
+    'FLOOR_HEIGHT',
+    'NUMBER_HABITABLE_ROOMS',
+    'LOW_ENERGY_LIGHTING',
+    'MULTI_GLAZE_PROPORTION',
+    'NUMBER_HEATED_ROOMS'
+    ]
+
+def create_raw_data():
     """
     Extract all information to do a simple predictor for RDSAP
     """
     directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
-    directories = directories[0:10]
+    # directories = directories[0:10]
     dfs = []
     for directory in tqdm(directories):
         filepath = directory / "certificates.csv"
-        df = pd.read_csv(filepath)
+        df = pd.read_csv(filepath, low_memory=False)
 
         # Remove any bad uprns and ignore old/bad data
         df = df[~pd.isnull(df["UPRN"])]
@@ -53,11 +73,45 @@ def main():
 
         df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
         df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
+        # Keep only possible modelling columns
+        df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))]
+
+        # Reduce memory usage
+
+        # df.memory_usage()
+        # df.dtypes
+        df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
+        df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')
+
+
 
         dfs.append(df)
 
     data = pd.concat(dfs)
     data.to_parquet('./energy_predictor_data.parquet')
+    cleaned_data = data.dropna()
+    # Gives you primarily flats
+    cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')
+
+
+def main():
+    """Fit an AutoGluon regressor for RDSAP_RESPONSE on a 1% sample and evaluate it on the test parquet."""
+    data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet')
+
+    subsample_size = round(len(data)/100)
+    data = data.sample(subsample_size, random_state=RANDOM_SEED)
+
+    predictor_RDSAP = TabularPredictor(
+        label=RDSAP_RESPONSE,
+        path="agModels-predictENERGY",
+        problem_type="regression",
+        eval_metric='mean_absolute_error'
+    ).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT'])
+
+    test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')
+    performance = predictor_RDSAP.evaluate(test_data)
+    predictions = predictor_RDSAP.predict(test_data)
+    predictor_RDSAP.feature_importance(test_data)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/model_data/simulation_system/model_build_data/test_data.parquet b/model_data/simulation_system/model_build_data/test_data.parquet
deleted file mode 100644
index e2995d28..00000000
Binary files a/model_data/simulation_system/model_build_data/test_data.parquet and /dev/null differ
diff --git a/model_data/simulation_system/model_build_data/train_validation_data.parquet b/model_data/simulation_system/model_build_data/train_validation_data.parquet
deleted file mode 100644
index d56d5cbb..00000000
Binary files a/model_data/simulation_system/model_build_data/train_validation_data.parquet and /dev/null differ
diff --git a/model_data/simulation_system/test_data_generation.py b/model_data/simulation_system/test_data_generation.py
index a3e47a65..fb7d7c64 100644
--- a/model_data/simulation_system/test_data_generation.py
+++ b/model_data/simulation_system/test_data_generation.py
@@ -28,7 +28,7 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
     """
 
     logger.info('---Loading Data---')
-    data = pd.read_parquet(filepath)
+    data = pd.read_parquet(filepath).reset_index(drop=True)
 
     if percentage and volume is None:
         test_amount = round(len(data)*percentage)
@@ -48,7 +48,7 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
 
         sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index
         train_validation_data = data.drop(sample_index)
-        test_data = data.loc[sample_index]
+        test_data = data.iloc[sample_index]
 
     elif sampling =='stratified':
         # Not yet implemented
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index d6846183..cde310a3 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,9 +1,20 @@
 import os
 import pandas as pd
 import argparse
+from typing import List
 from Logger import logger
 from autogluon.tabular import TabularDataset, TabularPredictor
 
+
+DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
+FEATURE_COLUMNS = None
+RANDOM_SEED = 0
+
+# FOR TESTING
+train_filepath = "./model_build_data/train_validation_data.parquet"
+test_filepath = "./model_build_data/test_data.parquet"
+
+
 def ingest_arguments() -> argparse.Namespace:
     """
     Helper function to take in arguments from script start
@@ -11,32 +22,109 @@ def ingest_arguments() -> argparse.Namespace:
 
     parser = argparse.ArgumentParser(description='Inputs for training script')
 
-    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load')
+    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
+    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
 
     args = parser.parse_args()
 
     return args
 
 
-def training(filepath: str) -> None:
+
+class DataLoader():
+
+    @staticmethod
+    def load(filepath: str) -> pd.DataFrame:
+        """
+        Load different datasets
+        """
+        if filepath.endswith('.parquet'):
+            df = pd.read_parquet(filepath)
+        elif filepath.endswith('.csv'):
+            df = pd.read_csv(filepath)
+        else:
+            logger.error('Not implemented!')
+            raise SystemExit(1)
+
+        return df
+
+class FeatureProcessor:
+    """
+    Handle all feature manipulation before modelling
+    """
+
+    @staticmethod
+    def drop_columns(df: pd.DataFrame, drop_columns: List[str] = DROP_COLUMNS) -> pd.DataFrame:
+        """Return df without the columns named in drop_columns."""
+        df = df.drop(columns=drop_columns)
+        return df
+
+    @staticmethod
+    def retain_features(df: pd.DataFrame, features: List[str] = None):
+        """
+        Determine which columns to keep for modelling
+        """
+        if features is None:
+            features = df.columns
+        else:
+            if not set(features).issubset(df.columns):
+                logger.error('Features defined is not contained in data')
+                raise SystemExit(1)
+
+        df = df[features]
+
+        return df
+
+    def process(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Drop DROP_COLUMNS, then keep only FEATURE_COLUMNS (every column when it is None)."""
+        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
+        df = self.retain_features(df, features=FEATURE_COLUMNS)
+        return df
+
+
+
+def training(train_filepath: str, test_filepath: str) -> None:
     """
     Pipeline to run training on the dataset
     """
 
     logger.info('Loading data')
-    data = pd.read_parquet(filepath)
+    dataloader = DataLoader()
+    train_df = dataloader.load(filepath=train_filepath)
+    test_df = dataloader.load(filepath=test_filepath)
+
+    # df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
 
-    logger.info('Feature selection')
-    feature_columns = data.columns
-    data = data[feature_columns]
-
-    logger.info('Split data into train and validation')
+    logger.info('Feature processing')
+    feature_processor = FeatureProcessor()
+    train_df = feature_processor.process(train_df)
+    test_df = feature_processor.process(test_df)
+    # logger.info('Split data into train and validation')
 
     logger.info('Build Model')
-
+    data = TabularDataset(data=train_df)
+    # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
+    subsample_size = round(len(data)/4)
+    data = data.sample(subsample_size, random_state=RANDOM_SEED)
+
+    target_column = 'RDSAP_CHANGE'
+    predictor_RDSAP = TabularPredictor(
+        label=target_column,
+        path="agModels-predictRDSAP",
+        problem_type="regression",
+        eval_metric='mean_absolute_error'
+    ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])
 
     logger.info('Evaluate matrics')
 
+    # Evaluate on the processed frame loaded from test_filepath rather than a hard-coded path
+    test_data = TabularDataset(data=test_df)
+    performance = predictor_RDSAP.evaluate(test_data)
+    predictions = predictor_RDSAP.predict(test_data)
+
+    test_data['predictions'] = predictions
+    test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
+
 
 if __name__ == "__main__":
     logger.info('---Begin Pipeline---')
@@ -44,4 +132,4 @@ if __name__ == "__main__":
     logger.info('---Ingest Arguments---')
     args = ingest_arguments()
 
-    training(filepath=args.filepath)
\ No newline at end of file
+    training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
\ No newline at end of file