Model/model_data/simulation_system/training.py
2023-08-15 18:18:55 +01:00

114 lines
No EOL
3.5 KiB
Python

import os
import pandas as pd
import argparse
from typing import List
from Logger import logger
from DataLoader import DataLoader
from autogluon.tabular import TabularDataset, TabularPredictor
# Columns removed from every dataset before modelling (used by FeatureProcessor).
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
# Optional allow-list of columns to keep; None keeps every column left after the drop.
FEATURE_COLUMNS = None
# Seed handed to the training-data subsample so repeated runs pick the same rows.
RANDOM_SEED = 0
# FOR TESTING
# NOTE(review): these two module-level paths are not read by any code visible in
# this file (training() receives its paths as arguments) - confirm before removing.
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"
def ingest_arguments(argv: List[str] = None) -> argparse.Namespace:
    """
    Parse the command-line options for the training script.

    Args:
        argv: Argument strings to parse instead of the process command line.
            Leave as None (the default) to read ``sys.argv[1:]``.

    Returns:
        Namespace exposing ``train_filepath`` and ``test_filepath``. Either
        attribute is None when its option is not supplied.
    """
    parser = argparse.ArgumentParser(description='Inputs for training script')
    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
    # parse_args(None) falls back to sys.argv[1:], so callers that pass nothing
    # keep the original behaviour.
    return parser.parse_args(argv)
class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_columns(df: pd.DataFrame, drop_columns: List[str] = None) -> pd.DataFrame:
        """
        Return a new frame without the given columns.

        Args:
            df: Data to remove columns from; it is not modified.
            drop_columns: A single column name or a list of column names.
                None (the default) means the module-level DROP_COLUMNS.

        Returns:
            The frame produced by ``DataFrame.drop``.

        Raises:
            KeyError: Raised by pandas when a requested column is not present.
        """
        if drop_columns is None:
            # Looked up at call time so the default always reflects the
            # current module-level setting.
            drop_columns = DROP_COLUMNS
        if isinstance(drop_columns, str):
            # A bare name is treated as a one-element list.
            drop_columns = [drop_columns]
        # Pass a flat list of labels; nesting the list inside another list
        # makes pandas look for the whole list as a single label.
        return df.drop(columns=list(drop_columns))

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Determine which columns to keep for modelling

        Args:
            df: Data to select columns from.
            features: Column names to keep. None keeps every column.

        Returns:
            A frame restricted to the requested columns.

        Raises:
            SystemExit: With status 1, after logging an error, when any
                requested feature is missing from ``df``.
        """
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            # Same effect as exit(1) without depending on the site-provided builtin.
            raise SystemExit(1)
        return df[list(features)]

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the module-level configuration: drop DROP_COLUMNS, then keep
        only FEATURE_COLUMNS (or everything when that is None).
        """
        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
        df = self.retain_features(df, features=FEATURE_COLUMNS)
        return df
def training(train_filepath: str, test_filepath: str) -> None:
    """
    Pipeline to run training on the dataset

    Loads both datasets, applies the shared feature processing, fits an
    AutoGluon regressor for RDSAP_CHANGE on a subsample of the training
    rows, then evaluates it on the processed test set.

    Args:
        train_filepath: Location of the dataset used for fitting.
        test_filepath: Location of the dataset used for evaluation.
    """
    logger.info('Loading data')
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)

    logger.info('Feature processing')
    feature_processor = FeatureProcessor()
    train_df = feature_processor.process(train_df)
    test_df = feature_processor.process(test_df)

    logger.info('Build Model')
    data = TabularDataset(data=train_df)
    # Fit on a quarter of the rows; the fixed seed keeps the subsample repeatable.
    subsample_size = round(len(data) / 4)
    data = data.sample(subsample_size, random_state=RANDOM_SEED)
    target_column = 'RDSAP_CHANGE'
    predictor_RDSAP = TabularPredictor(
        label=target_column,
        path="agModels-predictRDSAP",
        problem_type="regression",
        eval_metric='mean_absolute_error'
    ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])

    logger.info('Evaluate matrics')
    # Evaluate on the dataset the caller asked for, after the same feature
    # processing as the training data, instead of re-reading a fixed path.
    test_data = TabularDataset(data=test_df)
    performance = predictor_RDSAP.evaluate(test_data)
    logger.info(f'Test performance: {performance}')
    predictions = predictor_RDSAP.predict(test_data)
    # Per-row predictions and absolute errors are attached to the local test
    # frame only; this function does not return or persist them.
    test_data['predictions'] = predictions
    test_data['diff'] = abs(test_data[target_column] - test_data['predictions'])
if __name__ == "__main__":
    # Script entry point: announce the run, read the CLI options, then hand
    # both dataset paths to the training pipeline.
    logger.info('---Begin Pipeline---')
    logger.info('---Ingest Arguments---')
    cli_args = ingest_arguments()
    training(cli_args.train_filepath, cli_args.test_filepath)