# model_data/simulation_system/Logger.py
#
# NOTE: the same patch also switches DataProcessor.py and app.py to import
# from simulation_system.Settings (settings.py is renamed to Settings.py);
# those hunks only touch import lines and fragments of app().
import logging

# Name given to the handler installed by setup_logger() so that it can be
# recognised on later calls.
_HANDLER_NAME = 'simulation_system_stream'
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'


def setup_logger() -> logging.Logger:
    """
    Configure the root logger and return it.

    The root logger's level is set to INFO and one stream handler using the
    ``asctime - levelname - message`` format is attached to it.

    The function is safe to call repeatedly: the handler is tagged with a
    name and is only attached when no handler with that name is present, so
    log records are never emitted more than once by this module.

    Returns:
        The configured root logger.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    already_installed = any(
        handler.get_name() == _HANDLER_NAME for handler in root_logger.handlers
    )
    if not already_installed:
        # StreamHandler() without an explicit stream writes to sys.stderr.
        stream_handler = logging.StreamHandler()
        stream_handler.set_name(_HANDLER_NAME)
        stream_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
        root_logger.addHandler(stream_handler)

    return root_logger


# Shared logger imported by the pipeline scripts (`from Logger import logger`).
logger = setup_logger()
# model_data/simulation_system/test_data_generation.py
#
# NOTE: this span of the patch also carries a comment fix in app.py
# ("non all None" -> "not all None") and adds three binary parquet files
# (model_build_data/test_data.parquet,
# model_build_data/train_validation_data.parquet and
# preprocessed_data/dataset.parquet); none of those are reproduced here.
from Logger import logger
import argparse
import pandas as pd
from pathlib import Path

# Fixed seed so that the random test split is reproducible between runs.
RANDOM_SEED = 0


def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command-line arguments for the test data generation script.

    Returns:
        Namespace with ``filepath``, ``output_folder``, ``percentage``,
        ``volume`` and ``sampling``. ``percentage`` and ``volume`` are None
        when not supplied; ``sampling`` defaults to 'random'.
    """
    parser = argparse.ArgumentParser(description='Inputs for test data generation script')

    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)
    parser.add_argument('--output-folder', type=str, help='Location of Parquet dataset to save', required=True)
    parser.add_argument('--percentage', type=float, help='Percentage of data to use as test data', default=None)
    parser.add_argument('--volume', type=int, help='Volume of data to use as test data', default=None)
    parser.add_argument(
        '--sampling',
        type=str,
        help='Type of sampling to do for test data',
        choices=['random', 'stratified'],
        default='random',
    )

    return parser.parse_args()


def _resolve_test_amount(total_rows: int, percentage, volume) -> int:
    """
    Work out how many rows go into the test set.

    ``percentage`` is multiplied directly by the row count, i.e. it is used
    as a fraction of the dataset. Presence is decided with ``is None`` rather
    than truthiness so that an explicit 0 is honoured instead of being
    mistaken for "not supplied".

    Raises:
        SystemExit: with code 1 when neither percentage nor volume is given.
    """
    if percentage is None and volume is None:
        logger.error('No amount specified - please specify either a percentage or volume')
        raise SystemExit(1)

    if volume is None:
        return round(total_rows * percentage)

    if percentage is None:
        return volume

    logger.info('Both percentage and volume specified - taking largest of the two')
    return max(round(total_rows * percentage), volume)


def main(filepath: str, output_folder: str, percentage: float, volume: int, sampling: str) -> None:
    """
    Load a dataset and split it into training+validation data and test data.

    Args:
        filepath: Parquet file to load.
        output_folder: Folder that receives 'train_validation_data.parquet'
            and 'test_data.parquet'; it is created if it does not exist.
        percentage: Fraction of the rows to use as test data, or None.
        volume: Number of rows to use as test data, or None. When both
            percentage and volume are given, the larger amount is used.
        sampling: 'random' or 'stratified' (the latter is not implemented).

    Raises:
        NotImplementedError: if ``sampling`` is 'stratified'.
        ValueError: if ``sampling`` is any other unsupported value.
        SystemExit: if neither ``percentage`` nor ``volume`` is given.
    """
    # Reject unsupported sampling modes before doing any I/O; previously the
    # 'stratified' branch fell through and failed later on an unbound variable.
    if sampling == 'stratified':
        raise NotImplementedError('Stratified sampling is not yet implemented')
    if sampling != 'random':
        raise ValueError(f'Unknown sampling method: {sampling!r}')

    logger.info('---Loading Data---')
    data = pd.read_parquet(filepath)

    test_amount = _resolve_test_amount(len(data), percentage, volume)

    logger.info('---Extracting %s from dataset to be test data', test_amount)

    logger.info('--- Using random sample method ---')
    sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index

    # NOTE(review): splitting by index label assumes the index labels are
    # unique - confirm for the datasets fed to this script.
    train_validation_data = data.drop(sample_index)
    test_data = data.loc[sample_index]

    logger.info('--- Saving data ---')

    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)
    train_validation_data.to_parquet(output_dir / 'train_validation_data.parquet')
    test_data.to_parquet(output_dir / 'test_data.parquet')

    logger.info(' ---Pipeline complete---')


if __name__ == "__main__":

    logger.info('--- Generate test data pipeline ---')

    args = ingest_arguments()

    main(
        filepath=args.filepath,
        output_folder=args.output_folder,
        percentage=args.percentage,
        volume=args.volume,
        sampling=args.sampling,
    )
# model_data/simulation_system/training.py
import os
import pandas as pd
import argparse
from Logger import logger
from autogluon.tabular import TabularDataset, TabularPredictor


def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command-line arguments for the training script.

    Returns:
        Namespace with ``filepath``, the Parquet dataset to train on.
    """
    parser = argparse.ArgumentParser(description='Inputs for training script')

    # Required: training() passes this straight to pd.read_parquet, which
    # cannot load from None.
    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)

    return parser.parse_args()


def training(filepath: str) -> None:
    """
    Pipeline to run training on the dataset.

    Only loading and a pass-through feature selection are implemented so
    far; the split, model build and evaluation steps currently just log
    their names.

    Args:
        filepath: Parquet file to load the dataset from.
    """
    logger.info('Loading data')
    data = pd.read_parquet(filepath)

    logger.info('Feature selection')
    # Placeholder selection: every column is kept for now.
    feature_columns = data.columns
    data = data[feature_columns]

    # TODO: the steps below are not implemented yet; they only log.
    logger.info('Split data into train and validation')

    logger.info('Build Model')

    logger.info('Evaluate matrics')


if __name__ == "__main__":

    logger.info('---Begin Pipeline---')

    logger.info('---Ingest Arguments---')
    args = ingest_arguments()

    training(filepath=args.filepath)