added mlmodel, prediction and training files

2026-08-03 05:18:22 +00:00 · 2023-08-17 16:07:22 +01:00 · 2023-08-17 16:07:22 +01:00 · 2a18180c53
commit 2a18180c53
parent a90a1278a8
15 changed files with 529 additions and 237 deletions
--- a/model_data/simulation_system/MLModel.py
+++ b/model_data/simulation_system/MLModel.py
@ -1,113 +0,0 @@
-"""
-MLModel class
-Key tasks:
- Template Model class for different model types
- Save model
- Load Model 
- Generate Inference
-"""
-
-from pathlib import Path
-from typing import Protocol, NamedTuple
-import pandas as pd
-from autogluon import TabularPredictor
-
-AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
-
-class MLModel(Protocol):
-    '''
-    Base ML Model protocol
-    '''
-
-    def load_model(self, filepath: Path) -> None:
-        """
-        Providing a path, this function will load the model to be used. Will load to internal variable
-        """
-
-    def save_model(self, output_filepath: Path) -> None:
-        """
-        Providing a path, this function will save the model to be used.
-        """
-
-    def train_model(
-            self, 
-            data: pd.DataFrame,
-            target: str,
-            hyperparameter: dict) -> None:
-        """
-        For the given data and hyperparameters (specified to the model), a model is trained
-        """
-
-    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
-        """
-        For the given dataframe, model is loaded and predictions are generated
-        """
-
-    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
-        """
-        For any validation data, a set of predictions and metrics are return
-        """
-
-class AutogluonModel(MLModel):
-    """
-    Autogluon model that implements the MLModel Protocol
-    """
-    def __init__(self) -> None:
-        self.model = None
-
-    def load_model(self, filepath: Path) -> None:
-        """
-        Providing a path, this function will load the model to be used. Will load to internal variable
-        """
-        self.model = TabularPredictor.load(path=filepath)
-        
-
-    def save_model(self, output_filepath: Path) -> None:
-        """
-        Providing a path, this function will save the model to be used.
-        """
-
-    def train_model(
-            self, 
-            data: pd.DataFrame, 
-            target_column: str, 
-            hyperparameters: dict = None) -> None:
-        """
-        For the given data and hyperparameters, a model is trained
-        """
-
-        if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()):
-            print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
-            exit(1)
-
-        self.model = TabularPredictor(
-            label=target_column, 
-            path=hyperparameters['output_path'], 
-            problem_type=hyperparameters['problem_type'],
-            eval_metric=hyperparameters['eval_metric']
-            ).fit(
-            data, 
-            time_limit=hyperparameters['time_limit'], 
-            presets=hyperparameters['presets'], 
-            excluded_model_types=hyperparameters['excluded_model_types']
-            )
-
-
-    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
-        """
-        For the given dataframe, model is loaded and predictions are generated
-        """
-
-        if self.model is None:
-            print("No model loaded/ trained")
-            exit(1)
-
-        predictions = self.model.predict(data)
-
-        return predictions
-
-    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
-        """
-        For any validation data, a set of predictions and metrics are return
-        """
-
--- a/model_data/simulation_system/MLModel/BaseMLModel.py
+++ b/model_data/simulation_system/MLModel/BaseMLModel.py
@ -0,0 +1,56 @@
+"""
+BaseMLModel class
+This is the base protocol:
+- Any implementation will be its own separate file
+Key tasks:
+- Template Model class for different model types
+- Save model
+- Load Model 
+- Generate Inference
+"""
+
+from pathlib import Path
+from typing import Protocol, NamedTuple
+import pandas as pd
+
+
+class MLModel(Protocol):
+    '''
+    Base ML Model protocol
+
+    Structural interface that every model implementation is expected to
+    satisfy. The signatures mirror the keyword arguments the training and
+    prediction scripts pass, so an implementation that matches this protocol
+    can be swapped in without touching the callers.
+    '''
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Providing a path, this function will load the model to be used. Will load to internal variable
+        """
+
+    def save_model(self, output_filepath: Path) -> None:
+        """
+        Providing a path, this function will save the model to be used.
+        """
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameters: dict
+            ) -> None:
+        """
+        For the given data and hyperparameters (specified to the model), a model is trained
+
+        The keyword is `hyperparameters` (plural): that is the name the
+        training script passes and the name implementations accept.
+        """
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        For the given dataframe, model is loaded and predictions are generated
+        """
+
+    def model_evaluation(self, validation_data: pd.DataFrame, target_column: str, metrics_location: Path = None) -> pd.DataFrame:
+        """
+        For any validation data, a set of predictions and metrics are returned
+
+        Returns the metrics as a DataFrame; `metrics_location` is the folder
+        the metrics file is written to.
+        """
+
+    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
+        """
+        Performance post processing on Model to ensure ready for deployment
+
+        Returns the location of the deployment-ready model.
+        """
+
--- a/model_data/simulation_system/MLModel/Models.py
+++ b/model_data/simulation_system/MLModel/Models.py
@ -0,0 +1,140 @@
+"""
+Different implementations of the MLModel Protocol 
+Uses the BaseMLModel protocol
+Key tasks:
+- Template Model class for different model types
+- Save model
+- Load Model 
+- Generate Inference
+"""
+
+from typing import NamedTuple
+from pathlib import Path
+import pandas as pd
+from autogluon.tabular import TabularDataset, TabularPredictor
+from sklearn.metrics import mean_absolute_percentage_error
+from core.Logger import logger
+
+AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
+METRIC_FILENAME = "metrics.csv"
+
+class AutogluonModel:
+    """
+    Autogluon model that implements the MLModel Protocol
+
+    Wraps a TabularPredictor in `self.model`. `output_filepath` is the path
+    handed to the predictor when training, and `predictions` holds the
+    predictions produced by the most recent `model_evaluation` call.
+    """
+    def __init__(self, output_filepath: Path = None) -> None:
+        self.model = None
+        self.output_filepath = output_filepath
+        self.predictions = None
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Providing a path, this function will load the model to be used. Will load to internal variable
+        """
+        self.model = TabularPredictor.load(path=filepath)
+
+    def save_model(self, output_filepath: Path = None) -> None:
+        """
+        Kept for protocol compatibility: this method only logs a message.
+        `output_filepath` is accepted but not used.
+        """
+        logger.info("Using AutoGluon Model - Model saving already occured")
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameters: dict = None) -> None:
+        """
+        For the given data and hyperparameters, a model is trained
+
+        `hyperparameters` must contain exactly the keys listed in
+        AUTOGLUON_HYPERPARAMETERS. The process exits with status 1 when no
+        `output_filepath` was set on the instance or when the
+        hyperparameters are missing or have the wrong keys.
+        """
+        if self.output_filepath is None:
+            logger.error("Please specify a output_filepath in order to train a model")
+            exit(1)
+
+        # `hyperparameters` defaults to None, so guard before reading its keys
+        if hyperparameters is None or set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters):
+            logger.error("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
+            exit(1)
+
+        AGdata = TabularDataset(data=data)
+
+        self.model = TabularPredictor(
+            label=target_column,
+            path=self.output_filepath,
+            problem_type=hyperparameters['problem_type'],
+            eval_metric=hyperparameters['eval_metric']
+            ).fit(
+            AGdata,
+            time_limit=hyperparameters['time_limit'],
+            presets=hyperparameters['presets'],
+            excluded_model_types=hyperparameters['excluded_model_types']
+            )
+
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        For the given dataframe, predictions are generated with the model
+        currently held in `self.model`. Exits with status 1 when no model
+        has been loaded or trained.
+        """
+
+        if self.model is None:
+            logger.error("No model loaded/ trained")
+            exit(1)
+
+        predictions = self.model.predict(data)
+
+        return predictions
+
+    def model_evaluation(
+            self,
+            validation_data: pd.DataFrame,
+            target_column: str,
+            metrics_location: Path = None,
+            metric_filename: str = METRIC_FILENAME
+            ) -> pd.DataFrame:
+        """
+        For any validation data, a set of predictions and metrics are returned
+
+        The predictor's own evaluation is extended with MAPE, written to
+        `metrics_location / metric_filename` as CSV and returned as a
+        one-row DataFrame. The predictions are kept on `self.predictions`.
+        When `metrics_location` is None the current folder is used.
+        """
+        if metrics_location is None:
+            logger.warning("Metrics will be outputted to current folder")
+            # Honour the warning instead of failing later on `None.mkdir`
+            metrics_location = Path(".")
+        metrics_location = Path(metrics_location)
+
+        if self.model is None:
+            logger.error("No model loaded/ trained - Unable to generate evaluation")
+            exit(1)
+
+        performance = self.model.evaluate(validation_data)
+        predictions = self.generate_predictions(validation_data)
+
+        logger.info("Prediction used for evaluations are saved in self.predictions")
+        self.predictions = predictions
+
+        # TODO: Can have a custom metric class that defines all different metrics we want
+        metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
+
+        performance['mape'] = metric_mape
+
+        logger.info(f"Saving metric file as {metric_filename}")
+        # parents=True so a not-yet-existing output tree does not abort the run
+        metrics_location.mkdir(parents=True, exist_ok=True)
+
+        metrics_df = pd.DataFrame([performance])
+        metrics_df.to_csv(metrics_location / metric_filename)
+
+        return metrics_df
+
+    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
+        """
+        We can optimise the deployment for a autogluon model
+
+        Returns whatever `clone_for_deployment` returns for `deployment_path`.
+        Exits with status 1 when there is no model or no deployment path.
+        """
+        if self.model is None:
+            logger.error("No model to optimise for deployment")
+            exit(1)
+
+        if deployment_path is None:
+            logger.error("Deployment path required")
+            exit(1)
+
+        # This will return a string path of the location
+        return self.model.clone_for_deployment(deployment_path)
+
+        
+
+
+        
--- a/model_data/simulation_system/MLModel/init.py
+++ b/model_data/simulation_system/MLModel/init.py
--- a/model_data/simulation_system/core/DataLoader.py
+++ b/model_data/simulation_system/core/DataLoader.py
@ -0,0 +1,21 @@
+import pandas as pd
+from core.Logger import logger
+
+class DataLoader():
+    """
+    Load datasets from disk into pandas DataFrames
+    """
+
+    @staticmethod
+    def load(filepath: str, index_col: str = None) -> pd.DataFrame:
+        """
+        Load different datasets
+
+        The reader is chosen from the file extension: `.parquet` or `.csv`.
+        `filepath` may be a str or a pathlib.Path. When `index_col` is given,
+        that column becomes the index of the returned DataFrame. Any other
+        extension logs an error and exits with status 1.
+        """
+        # Path objects have no `.endswith`; normalise so both types work
+        filepath = str(filepath)
+
+        if filepath.endswith('.parquet'):
+            df = pd.read_parquet(filepath)
+            if index_col is not None:
+                df = df.set_index(index_col)
+        elif filepath.endswith('.csv'):
+            df = pd.read_csv(filepath, index_col=index_col)
+        else:
+            logger.error('Not implemented!')
+            exit(1)
+
+        return df
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -2,7 +2,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 from model_data.BaseUtility import BaseUtility
-from simulation_system.Settings import (
+from simulation_system.core.Settings import (
    DATA_PROCESSOR_SETTINGS,
    EARLIEST_EPC_DATE,
    FULLY_GLAZED_DESCRIPTIONS,
--- a/model_data/simulation_system/core/FeatureProcessor.py
+++ b/model_data/simulation_system/core/FeatureProcessor.py
@ -0,0 +1,70 @@
+"""
+Create additional features from the dataset
+"""
+
+import pandas as pd
+from typing import List
+from core.Logger import logger
+
+RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
+HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
+
+RANDOM_SEED = 0 
+   
+class FeatureProcessor:
+    """
+    Feature manipulation steps applied to a dataset before modelling
+    """
+
+    @staticmethod
+    def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
+        """
+        Drop the columns that are not used when modelling `target_column`.
+        Any other target leaves the frame untouched.
+        """
+        if target_column == "RDSAP_CHANGE":
+            return df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
+        if target_column == "HEAT_DEMAND_CHANGE":
+            return df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
+        return df
+
+    @staticmethod
+    def retain_features(df: pd.DataFrame, features: List[str] = None):
+        """
+        Keep only the requested columns; with no `features` every column is
+        kept. Exits with status 1 when a requested column is missing.
+        """
+        if features is None:
+            return df[df.columns]
+
+        if not set(features).issubset(df.columns):
+            logger.error('Features defined is not contained in data')
+            exit(1)
+
+        return df[features]
+
+    @staticmethod
+    def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
+        """
+        Draw `subsample_amount` rows with a fixed seed; a falsy amount
+        returns the frame as it was passed in.
+        """
+        if not subsample_amount:
+            return df
+        return df.sample(subsample_amount, random_state=RANDOM_SEED)
+
+    def process(
+            self,
+            df: pd.DataFrame,
+            target_column: str = "RDSAP_CHANGE",
+            features: List[str] = None,
+            subsample_amount: int = None
+            ) -> pd.DataFrame:
+        """
+        Run the steps in order: subsample, drop unused columns, retain features
+        """
+        sampled = self.subsample_data(df, subsample_amount=subsample_amount)
+        trimmed = self.drop_unused_columns(sampled, target_column=target_column)
+        return self.retain_features(trimmed, features=features)
--- a/model_data/simulation_system/core/Logger.py
+++ b/model_data/simulation_system/core/Logger.py
@ -1,3 +1,7 @@
+"""
+Logger that will be used throughout the application
+"""
+
 import logging 

 def setup_logger():
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -1,5 +1,18 @@
 # Using a simply python file as settings for now 
 # TODO: migrate to dynaconf
+from pathlib import Path
+
+RANDOM_SEED = 0
+
+TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
+TEST_DATA_NAME = 'test_data.parquet'
+
+REGISTRY_FILE = "model_registry.csv"
+MODEL_DIRECTORY = "model_directory"
+REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
+PREDICTION_LOCATION = Path("predictions")
+PREDICTION_FILE = 'prediction.json'
+METADATA_FILE = 'metadata.json'

 TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
--- a/model_data/simulation_system/core/init.py
+++ b/model_data/simulation_system/core/init.py
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@ -1,5 +1,5 @@
 from pathlib import Path
-from Settings import (
+from core.Settings import (
    RDSAP_RESPONSE, 
    FLOOR_LEVEL_MAP, 
    BUILT_FORM_REMAP,
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -1,23 +1,22 @@
 import numpy as np
 import pandas as pd
 from tqdm import tqdm
-from model_data.BaseUtility import BaseUtility
 from pathlib import Path
-from model_data.simulation_system.Settings import (
+from core.Settings import (
    MANDATORY_FIXED_FEATURES,
    AVERAGE_FIXED_FEATURES, 
    LATEST_FIELD, 
    COMPONENT_FEATURES, 
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
-    COLUMNS_TO_MERGE_ON,
-    FLOOR_LEVEL_MAP,
-    BUILT_FORM_REMAP
+    COLUMNS_TO_MERGE_ON
 )
-from DataProcessor import DataProcessor
+from core.DataProcessor import DataProcessor

 DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

+# TODO: Have a look at temporal features
+
 def app():
    # Get all the files in the directory

@ -77,9 +76,6 @@ def app():
                    if abs(vals[0] - vals[1]) / vals[0] > 0.1:
                        # Take the more recent value since it's likely to be more accurate
                        vals = [vals[-1]]
-
-                if len(vals) == 0:
-                    wrong_var
         
                fixed_data[field] = np.mean(vals)

--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@ -2,63 +2,127 @@
 Script to load MLModel class and generate predictions
 """

-from Logger import logger
-from MLModel import AutogluonModel
-from DataLoader import DataLoader
+import json
+import argparse
+from MLModel.Models import AutogluonModel
+from core.Logger import logger
+from core.DataLoader import DataLoader
 from pathlib import Path
 import pandas as pd
 from typing import Optional
+from datetime import datetime
+from core.Settings import (
+    REGISTRY_PATH,
+    PREDICTION_LOCATION,
+    PREDICTION_FILE,
+    METADATA_FILE
+)

-# These will be provided in some configuration setup
-HYPERPARAMETERS = {
-    'problem_type': 'regression',
-    'output_path': 'agModels-predictRDSAP',
-    'eval_metric': 'mean_absolute_error', 
-    'time_limit': 8000, 
-    'presets': 'best_quality', 
-    'excluded_model_types': ['KNN']
+TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")

-}
+# FOR TESTING
+# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
+# DATA = TEST_DATA.sample(1)

-def main(model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
+
+def ingest_arguments() -> argparse.Namespace:
+    """
+    Helper function to take in arguments from script start
+
+    Parses `--model-path`, `--data` and `--data-path` from the command line
+    and returns them as an argparse Namespace. All three are optional here.
+    """
+
+    parser = argparse.ArgumentParser(description='Inputs for prediction script')
+    parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
+    parser.add_argument('--data', type=str, help='JSON string describing a single record to generate a prediction for')
+    parser.add_argument('--data-path', type=str, help='Location of dataset file to load and generate a prediction from')
+
+    args = parser.parse_args()
+
+    return args
+            
+
+
+def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
    """
    Main pipeline function
    """
+    # Loads a model (explicit `model_path`, otherwise the registry row flagged
+    # `best_model`), predicts on `data` or on one row sampled from the file at
+    # `data_path`, writes the prediction and model metadata under
+    # PREDICTION_LOCATION/<uprn>/<timestamp> and returns the prediction as a
+    # JSON string. `data` may be a JSON string or an already-built DataFrame.

-    if model_path is None:
-        logger.error("No model path provided")
+    if registry_path is None:
+        logger.error("No registry path provided")
        exit(1)

+    if model_path is not None:
+        logger.info("User specified a model to load - ignoring registry")
+        model_location = model_path
+        model_type = model_path
+        model_name = model_path
+    else:
+        # TODO: Think about where registry will sit/ type
+        logger.info("Loading best model from registry")
+        registry_df = pd.read_csv(registry_path)
+        best_model_df = registry_df[registry_df['best_model']]
+
+        # Fail with a clear message instead of an IndexError on `.values[0]`
+        if best_model_df.empty:
+            logger.error("No model flagged as best_model in registry")
+            exit(1)
+
+        model_location = best_model_df['model_location'].values[0]
+        model_type = best_model_df['model_type'].values[0]
+        model_name = best_model_df['model_name'].values[0]
+
+    logger.info("--- Model Info: ---")
+    logger.info(f"Model type: {model_type}")
+    logger.info(f"Model name: {model_name}")
+    logger.info(f"Model location: {model_location}")
+
+    logger.info("--- Loading Data ---")
    if data is None and data_path is None:
        logger.error("No Data/Data Path passed")
        exit(1)
-
    if data_path and data is None:
-        logger.info("--- Loading Data ---")
-        data = DataLoader().load()
+        logger.info("Loading data from provided path")
+        data = DataLoader().load(filepath=data_path, index_col="UPRN")
+
+        # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
+        data = data.sample(1)
    else:
-        logger.warning('Ignoring data_path and loading data provided')
+        logger.info('Using data provided')
+        # The CLI hands over a JSON string; a DataFrame passed by a Python
+        # caller (as the signature advertises) is used as-is
+        if not isinstance(data, pd.DataFrame):
+            data = json.loads(data)
+            data = pd.DataFrame([data])
+        print(data)

    logger.info("--- Loading Model ---")
    model = AutogluonModel()
-    model.load_model(filepath=model_path)
-
-    # model.train_model(
-    #     data=data, 
-    #     target_column='RDSAP_CHANGE', 
-    #     hyperparameters=HYPERPARAMETERS
-    #     )
+    model.load_model(filepath=model_location)

    logger.info("--- Generating Predictions ---")
    prediction = model.generate_predictions(data=data)

    # Save prediction some where?
-    prediction.to_csv("s3?")
+    # prediction.to_csv("s3?")

+    # TODO: Check how we want to structure outputs
+    # For now, just categorise by uprn and timestamp
+    # Assume one uprn coming in for now
+    uprn = data.index.values[0]
+
+    # Saving prediction local for now
+    logger.info("--- Outputting prediction and metadata --- ")
+    # Path segments must be strings; the index value may be numeric
+    output_base = PREDICTION_LOCATION / str(uprn) / TIMESTAMP
+    output_base.mkdir(parents=True, exist_ok=True)
+
+    # `to_json(path)` writes the file and returns None, so serialise first
+    # and write the string ourselves to be able to return it
+    json_prediction = prediction.to_json()
+    (output_base / PREDICTION_FILE).write_text(json_prediction)
+    prediction_metadata = {
+        "model_type": model_type,
+        "model_name": model_name,
+        "model_location": model_location,
+        "model_settings": model.model.info()
+    }
+
+    pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)
+
+    return json_prediction

 if __name__ == "__main__":
-    # For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
-    data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
-    data_for_prediction = data.sample(1)

-    main(filepath="", data=data_for_prediction)
+    args = ingest_arguments()
+
+    # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
+    # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
+    prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)
--- a/model_data/simulation_system/test_data_generation.py
+++ b/model_data/simulation_system/test_data_generation.py
@ -1,9 +1,12 @@
-from Logger import logger
+from core.Logger import logger
 import argparse
 import pandas as pd
 from pathlib import Path
-
-RANDOM_SEED = 0
+from core.Settings import (
+    RANDOM_SEED,
+    TRAIN_AND_VALIDATION_DATA_NAME,
+    TEST_DATA_NAME
+)

 def ingest_arguments() -> argparse.Namespace:
    """
@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp

    logger.info('--- Saving data ---')

-    train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
-    test_data.to_parquet(Path(output_folder)/'test_data.parquet')
+    train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME)
+    test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME)

    logger.info(' ---Pipeline complete---')

--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -1,19 +1,37 @@
-import os
-import pandas as pd
+
 import argparse
+from pathlib import Path
+from datetime import datetime
 from typing import List
-from Logger import logger
-from DataLoader import DataLoader
-from autogluon.tabular import TabularDataset, TabularPredictor
+from core.Logger import logger
+from core.DataLoader import DataLoader
+from core.FeatureProcessor import FeatureProcessor
+from MLModel.Models import AutogluonModel
+import pandas as pd
+from core.Settings import (
+    MODEL_DIRECTORY,
+    REGISTRY_PATH
+)

+TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")

-DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
-FEATURE_COLUMNS = None
-RANDOM_SEED = 0
+# Can move to a hyperparameters file
+# If anything we might want to have a file that can be loaded and sent to this script
+HYPERPARAMETERS = {
+    'problem_type': 'regression',
+    'eval_metric': 'mean_absolute_error', 
+    'time_limit': 60, 
+    'presets': 'medium_quality', 
+    'excluded_model_types': None
+}

 # FOR TESTING
-train_filepath = "./model_build_data/train_validation_data.parquet"
-test_filepath = "./model_build_data/test_data.parquet"
+train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
+test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
+target_column = "RDSAP_CHANGE"
+model_type = "autogluon"
+hyperparameter = HYPERPARAMETERS
+subsample_factor = 200


 def ingest_arguments() -> argparse.Namespace:
@ -23,98 +41,112 @@ def ingest_arguments() -> argparse.Namespace:

    parser = argparse.ArgumentParser(description='Inputs for training script')

-    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
-    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
+    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
+    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
+    parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
+    parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE')

    args = parser.parse_args()

    return args
-    
-class FeatureProcessor:
-    """
-    Handle all feature manipulation before modelling
-    """
-    
-    @staticmethod
-    def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
-        df = df.drop(columns=[drop_columns])
-        return df
-
-    def retain_features(df: pd.DataFrame, features: List[str] = None):
-        """
-        Determine which columns to keep ofr modelling
-        """
-        if features is None:
-            features = df.columns
-        else:
-            if not set(features).issubset(df.columns):
-                logger.error('Features defined is not contained in data')
-                exit(1)
-        
-        df = df[features]
-
-        return df
-    
-    def process(self, df: pd.DataFrame) -> pd.DataFrame:
-        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
-        df = self.retain_features(df, features=FEATURE_COLUMNS)
-        return df
-
            

-def training(train_filepath: str, test_filepath: str) -> None:
+def training(
+        train_filepath: str,
+        test_filepath: str,
+        target_column: str = "RDSAP_CHANGE",
+        model_type: str = "autogluon",
+        hyperparameter: dict = HYPERPARAMETERS
+        ) -> None:
    """
    Pipeline to run training on the dataset
    """
+    # Steps: load train/test data, run feature processing (training data is
+    # subsampled), train the model, write evaluation metrics, create a
+    # deployment copy of the model and record it in the CSV registry, where
+    # exactly one row is flagged as `best_model`.

-    logger.info('Loading data')
+    logger.info('--- Loading data ---')
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)
-
-    # df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
 
-    logger.info('Feature processing')
+    logger.info('--- Feature processing ---')
+
    feature_processor = FeatureProcessor()
-    train_df = feature_processor.process(train_df)
-    test_df = feature_processor.process(test_df)

-    # logger.info('Split data into train and validation')
+    subsample_amount = round(len(train_df)/subsample_factor)

-    logger.info('Build Model')
+    train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
+    test_df = feature_processor.process(test_df, target_column=target_column)
+
+    logger.info('--- Build Model ---')
+    if model_type == "autogluon":
+        # Name the model after the hyperparameters actually used for this run,
+        # not the module-level defaults
+        model_root = f"{target_column}-{hyperparameter['presets']}-{hyperparameter['time_limit']}-{TIMESTAMP}".lower()
+        output_base = Path(MODEL_DIRECTORY) / model_type / model_root
+
+        model_folder = "model"
+        metrics_folder = "metrics"
+
+        model = AutogluonModel(
+            output_filepath = output_base / model_folder
+            )
+    else:
+        logger.error("No alternative model implemented yet")
+        exit(1)
    
-    data = TabularDataset(data=train_filepath)
-    data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
-    TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
-    # top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
+    model.train_model(
+        data=train_df,
+        target_column=target_column,
+        hyperparameters=hyperparameter
+        )
+
+    logger.info("--- Save Model ---")
+    model.save_model(output_filepath=model.output_filepath)

-    data = data[['RDSAP_CHANGE'] + top_features.to_list()]
-    # data = TabularDataset(data=train_df)
-    # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
-    subsample_size = round(len(data)/20)
-    data = data.sample(subsample_size, random_state=RANDOM_SEED)
+    logger.info('--- Generate evaluation metrics ---')
+    metrics_df = model.model_evaluation(
+        validation_data=test_df,
+        target_column=target_column,
+        metrics_location = output_base / metrics_folder
+        )
+
+    # TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
+    # Imagining for now that the model trained here is the best model amongst all models built

-    # Add custom metric class MAPE
-    # Have a look at temporal features
+    logger.info("--- Optimising model for deployment ---")
+    optimised_folder = "deployment"
+    deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder)
+    logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")

-    target_column = 'RDSAP_CHANGE'
-    predictor_RDSAP = TabularPredictor(
-        label=target_column, 
-        path="agModels-predictRDSAP", 
-        problem_type="regression",
-        eval_metric='mean_absolute_error'
-        ).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
+    # TODO: Need a model registry - for now have this as a CSV
+    # Save this in the model directory
+    logger.info("--- Append registry with new model ---")
+
+    if REGISTRY_PATH.exists():
+        logger.info("Registry file found - Loading into Dataframe")
+        registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
+    else:
+        registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])

+    model_details_df = pd.DataFrame(
+        [{
+            'model_type': model_type,
+            'model_name': model_root,
+            'model_location': deployment_model_path
+        }]
+        )
+
+    registry_row = pd.concat([model_details_df, metrics_df], axis=1)
+    registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)

+    # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
+    # TODO: decide metric to optimise to
+    # NOTE(review): descending order only puts the best model first if the
+    # stored error metrics are in "higher is better" form - confirm against
+    # what the model's evaluate() call reports
+    registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
+    registry_df['best_model'] = False
+    registry_df.loc[0, 'best_model'] = True

-    logger.info('Evaluate matrics')
+    logger.info("--- Saving new model to registry ---")
+    # The registry lives next to the package, which may differ from the
+    # working directory the model was written to - make sure it exists
+    REGISTRY_PATH.parent.mkdir(parents=True, exist_ok=True)
+    registry_df.to_csv(REGISTRY_PATH, index=False)

-    test_data = TabularDataset('./model_build_data/test_data.parquet')
-    performance = predictor_RDSAP.evaluate(test_data)
-    predictions = predictor_RDSAP.predict(test_data)
+    logger.info("--- Training Pipeline Complete --- ")

-    test_data['predictions'] = predictions
-    test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])

 if __name__ == "__main__":

@ -123,4 +155,10 @@ if __name__ == "__main__":
    logger.info('---Ingest Arguments---')
    args = ingest_arguments()

-    training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
+    # TODO: Ingest hyper parameters from somewhere - currently change at the top of script
+    training(
+        train_filepath=args.train_filepath, 
+        test_filepath=args.test_filepath, 
+        target_column=args.target_column, 
+        model_type=args.model_type
+        )