add a test data generation script

This commit is contained in:
Michael Duong 2023-08-12 19:11:22 +00:00
parent e3be4ba344
commit db67e0e23f
9 changed files with 135 additions and 9 deletions

View file

@ -1,6 +1,6 @@
from pathlib import Path
import pandas as pd
from settings import (
from simulation_system.Settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,

View file

@ -0,0 +1,22 @@
import logging
def setup_logger():
    """
    Configure the root logger and return it.

    Sets the root logger's level to INFO and attaches one stream handler that
    formats records as ``<time> - <level> - <message>``. The handler is given
    a name so that a repeated call recognises it and does not attach a second
    one, which would make every record appear twice.

    Returns:
        logging.Logger: the root logger.
    """
    # Name used to recognise a handler attached by an earlier call
    handler_name = 'setup_logger_stream_handler'
    # getLogger() without a name returns the root logger
    logger = logging.getLogger()
    # Set the log level
    logger.setLevel(logging.INFO)
    already_attached = any(
        handler.get_name() == handler_name for handler in logger.handlers
    )
    if not already_attached:
        # Create a formatter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        # StreamHandler() without an explicit stream writes to sys.stderr
        stream_handler = logging.StreamHandler()
        stream_handler.set_name(handler_name)
        stream_handler.setFormatter(formatter)
        # Add the stream handler to the logger
        logger.addHandler(stream_handler)
    return logger
# Module-level logger, configured when this module is first imported
logger = setup_logger()

View file

@ -3,7 +3,7 @@ import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
from pathlib import Path
from settings import (
from simulation_system.Settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
@ -29,7 +29,7 @@ def app():
dataset = []
for directory in tqdm(directories):
for directory in tqdm(directories):
filepath = directory / "certificates.csv"
@ -75,7 +75,7 @@ def app():
# If there are any NA values, back fill first (i.e. most recent), then forward fill if needed
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
# Extract the columns that are non all None
# Extract the columns that are not all None
na_columns = modified_property_data[columns_to_merge_on].isna().all()
columns_to_merge_on = na_columns.index[~na_columns].to_list()

View file

@ -0,0 +1,77 @@
from Logger import logger
import argparse
import pandas as pd
from pathlib import Path
# Seed passed as random_state when main() samples the test rows
RANDOM_SEED = 0
def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command-line arguments for the test data generation script.

    Arguments:
        --filepath: Parquet dataset to load (required).
        --output-folder: folder the split datasets are saved to (required).
        --percentage: share of rows to use as test data, as a fraction.
        --volume: absolute number of rows to use as test data.
        --sampling: 'random' (default) or 'stratified'.

    Returns:
        argparse.Namespace with the attributes filepath, output_folder,
        percentage, volume and sampling.
    """
    # The description names this script (test data generation), not training
    parser = argparse.ArgumentParser(description='Inputs for test data generation script')
    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)
    parser.add_argument('--output-folder', type=str, help='Location of Parquet dataset to save', required=True)
    # The value is multiplied by the row count, so it is a fraction rather than 0-100
    parser.add_argument('--percentage', type=float, help='Fraction (between 0 and 1) of data to use as test data', default=None)
    parser.add_argument('--volume', type=int, help='Volume of data to use as test data', default=None)
    parser.add_argument('--sampling', type=str, help='Type of sampling to do for test data', choices=['random', 'stratified'], default='random')
    return parser.parse_args()
def main(filepath: str, output_folder: str, percentage: float, volume: int, sampling: str):
    """
    Load a dataset in and split out the training+validation data and the test data.

    Args:
        filepath: location of the Parquet dataset to load.
        output_folder: folder that receives 'train_validation_data.parquet'
            and 'test_data.parquet'; it is created if it does not exist.
        percentage: fraction of the rows to use as test data, or None.
        volume: number of rows to use as test data, or None.
        sampling: 'random' or 'stratified' (stratified is not implemented yet).

    Raises:
        SystemExit: if neither percentage nor volume is given.
        NotImplementedError: if sampling is 'stratified'.
        ValueError: if sampling is any other unrecognised value.
    """
    logger.info('---Loading Data---')
    data = pd.read_parquet(filepath)

    # Compare against None explicitly so that 0 / 0.0 count as "specified"
    # instead of falling through to the branch that expects both values.
    if percentage is None and volume is None:
        logger.error('No amount specified - please specify either a percentage or volume')
        raise SystemExit(1)
    if volume is None:
        test_amount = round(len(data) * percentage)
    elif percentage is None:
        test_amount = volume
    else:
        logger.info('Both percentage and volume specified - taking largest of the two')
        test_amount = max(round(len(data) * percentage), volume)

    logger.info(f'---Extracting {test_amount} from dataset to be test data')
    if sampling == 'random':
        logger.info('--- Using random sample method ---')
        # NOTE(review): drop/loc select by index label, so this assumes the
        # dataset's index is unique - confirm against the data being loaded.
        sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index
        train_validation_data = data.drop(sample_index)
        test_data = data.loc[sample_index]
    elif sampling == 'stratified':
        # Fail clearly here rather than with a NameError when saving below
        raise NotImplementedError('Stratified sampling is not yet implemented')
    else:
        raise ValueError(f'Unknown sampling method: {sampling!r}')

    logger.info('--- Saving data ---')
    output_dir = Path(output_folder)
    # Make sure the destination exists before writing into it
    output_dir.mkdir(parents=True, exist_ok=True)
    train_validation_data.to_parquet(output_dir / 'train_validation_data.parquet')
    test_data.to_parquet(output_dir / 'test_data.parquet')
    logger.info(' ---Pipeline complete---')
if __name__ == "__main__":
    # Script entry point: announce the run, parse the CLI, then do the split
    logger.info('--- Generate test data pipeline ---')
    args = ingest_arguments()
    # The parsed attributes (filepath, output_folder, percentage, volume,
    # sampling) carry the same names as main()'s parameters.
    main(**vars(args))

View file

@ -1,20 +1,47 @@
import os
from logging import Logger
import pandas as pd
import argparse
from Logger import logger
from autogluon.tabular import TabularDataset, TabularPredictor
# NOTE(review): this rebinds `logger`, replacing the instance imported from the
# Logger module above - looks like a leftover from before that import was
# added; confirm and remove.
logger = Logger(__name__)
def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command-line arguments for the training script.

    Arguments:
        --filepath: location of the Parquet dataset to load (required).

    Returns:
        argparse.Namespace with a ``filepath`` attribute.
    """
    parser = argparse.ArgumentParser(description='Inputs for training script')
    # Required: the value is handed directly to pandas.read_parquet by training()
    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)
    return parser.parse_args()
def training(filepath: str) -> None:
    """
    Run the training pipeline on the Parquet dataset at ``filepath``.

    The dataset is loaded and every column is kept as a feature; the
    remaining stages currently only write a log message.
    """
    logger.info('Loading data')
    dataset = pd.read_parquet(filepath)

    logger.info('Feature selection')
    # All columns are selected for now
    dataset = dataset[dataset.columns]

    # Stages below are logged only.
    # NOTE(review): 'matrics' is reproduced as-is; it looks like a typo for 'metrics'.
    for stage_message in (
        'Split data into train and validation',
        'Build Model',
        'Evaluate matrics',
    ):
        logger.info(stage_message)
if __name__ == "__main__":
    # Script entry point. training() needs the dataset path, so it is only
    # called once the command-line arguments have been parsed.
    logger.info('---Begin Pipeline---')
    logger.info('---Ingest Arguments---')
    args = ingest_arguments()
    training(filepath=args.filepath)