"""Training pipeline: load parquet data, process features, fit and evaluate an
AutoGluon regressor for the ``RDSAP_CHANGE`` target."""
import argparse
import os
from typing import List, Optional, Union

import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

from DataLoader import DataLoader
from Logger import logger

# Columns removed from every dataset before modelling.
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
# Explicit feature whitelist; None means "keep every remaining column".
FEATURE_COLUMNS = None
RANDOM_SEED = 0

# FOR TESTING
# NOTE(review): these paths are not wired into the argparse defaults below;
# confirm whether they should be.
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"


def ingest_arguments() -> argparse.Namespace:
    """Parse the command-line arguments given at script start.

    Returns:
        Namespace with ``train_filepath`` and ``test_filepath`` attributes
        (each ``None`` when the corresponding flag is omitted).
    """
    parser = argparse.ArgumentParser(description='Inputs for training script')
    parser.add_argument(
        '--train-filepath',
        type=str,
        help='Location of Parquet dataset to load for training',
    )
    parser.add_argument(
        '--test-filepath',
        type=str,
        help='Location of Parquet dataset to load for testing',
    )
    return parser.parse_args()


class FeatureProcessor:
    """Handle all feature manipulation before modelling."""

    @staticmethod
    def drop_columns(
        df: pd.DataFrame,
        drop_columns: Union[str, List[str]] = DROP_COLUMNS,
    ) -> pd.DataFrame:
        """Return ``df`` without the given column(s).

        Args:
            df: Input frame.
            drop_columns: A single column name or a list of column names.

        Returns:
            A new frame with the columns removed.

        Raises:
            KeyError: If a requested column is not present (pandas default).
        """
        # Normalise to a flat list: wrapping an existing list in another list
        # (as the previous implementation did) is not a valid label list.
        if isinstance(drop_columns, str):
            columns = [drop_columns]
        else:
            columns = list(drop_columns)
        return df.drop(columns=columns)

    @staticmethod
    def retain_features(
        df: pd.DataFrame,
        features: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """Determine which columns to keep for modelling.

        Args:
            df: Input frame.
            features: Columns to keep; ``None`` keeps every column.

        Returns:
            A frame restricted to ``features``.

        Raises:
            SystemExit: With code 1 if any requested feature is missing.
        """
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            # Same effect as exit(1), without relying on the site builtin.
            raise SystemExit(1)
        return df[features]

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the column drop and then the feature selection to ``df``."""
        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
        df = self.retain_features(df, features=FEATURE_COLUMNS)
        return df


def training(train_filepath: str, test_filepath: str) -> None:
    """Pipeline to run training on the dataset.

    Loads both datasets, applies ``FeatureProcessor``, fits a regressor on a
    quarter-size random sample of the training data and logs the evaluation
    on the processed test data.

    Args:
        train_filepath: Location of the training dataset.
        test_filepath: Location of the test dataset.
    """
    logger.info('Loading data')
    dataloader = DataLoader()
    # presumably returns pandas DataFrames; verify against DataLoader.load
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)

    logger.info('Feature processing')
    feature_processor = FeatureProcessor()
    train_df = feature_processor.process(train_df)
    test_df = feature_processor.process(test_df)

    logger.info('Build Model')
    data = TabularDataset(data=train_df)
    # Fit on a reproducible 25% sample of the training rows.
    subsample_size = round(len(data) / 4)
    data = data.sample(subsample_size, random_state=RANDOM_SEED)

    target_column = 'RDSAP_CHANGE'
    predictor_RDSAP = TabularPredictor(
        label=target_column,
        path="agModels-predictRDSAP",
        problem_type="regression",
        eval_metric='mean_absolute_error',
    ).fit(
        data,
        time_limit=8000,
        presets='high_quality',
        excluded_model_types=['KNN'],
    )

    logger.info('Evaluate matrics')
    # Evaluate on the processed frame loaded from ``test_filepath`` rather
    # than re-reading a hard-coded path, so the argument is honoured and the
    # test data goes through the same feature processing as the training data.
    test_data = TabularDataset(data=test_df)
    performance = predictor_RDSAP.evaluate(test_data)
    logger.info(f'Test performance: {performance}')

    predictions = predictor_RDSAP.predict(test_data)
    test_data['predictions'] = predictions
    test_data['diff'] = abs(test_data[target_column] - test_data['predictions'])
    logger.info(f"Mean absolute difference on test data: {test_data['diff'].mean()}")


if __name__ == "__main__":
    logger.info('---Begin Pipeline---')
    logger.info('---Ingest Arguments---')
    args = ingest_arguments()
    training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)