diff --git a/.github/workflows/cml.yml b/.github/workflows/cml.yml new file mode 100644 index 00000000..bf361dd3 --- /dev/null +++ b/.github/workflows/cml.yml @@ -0,0 +1,38 @@ +name: model-training +on: + push: + branches: + - mlmodel +permissions: write-all +jobs: + run: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + - uses: iterative/setup-cml@v1 + - name: Train model + env: + REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + ls + cd model_data/simulation_system + pip install -r requirements.txt + python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet + + cd model_directory/RDSAP_CHANGE + echo "## Model metrics" > report.md + metrics_location=$(find . -maxdepth 10 -name "metrics.md") + echo $metrics_location + cat $metrics_location >> report.md + + # echo "## Residuals plot from model" >> report.md + # metrics_location=$(find . 
-maxdepth 10 -name "residuals.png") + # echo $metrics_location + # cd $metric_location + # echo "![](./residuals.png)" >> report.md + + cml comment create report.md + + # cml comment create --log debug --publish false report.md + diff --git a/model_data/simulation_system/MLModel/BaseMLModel.py b/model_data/simulation_system/MLModel/BaseMLModel.py new file mode 100644 index 00000000..42106a33 --- /dev/null +++ b/model_data/simulation_system/MLModel/BaseMLModel.py @@ -0,0 +1,59 @@ +""" +BaseMLModel class +This is the base protocol: +- Any implementation will be its own seperate file +Key tasks: +- Template Model class for different model types +- Save model +- Load Model +- Generate Inference +""" + +from pathlib import Path +from typing import Protocol, NamedTuple +import pandas as pd + + +class MLModel(Protocol): + ''' + Base ML Model protocol + ''' + + def load_model(self, filepath: Path) -> None: + """ + Providing a path, this function will load the model to be used. Will load to internal variable + """ + + def save_model(self, output_filepath: Path) -> None: + """ + Providing a path, this function will save the model to be used. 
+ """ + + def train_model( + self, + data: pd.DataFrame, + target_column: str, + hyperparameter: dict + ) -> None: + """ + For the given data and hyperparameters (specified to the model), a model is trained + """ + + def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame: + """ + For the given dataframe, model is loaded and predictions are generated + """ + + def model_evaluation(self, validation_data: pd.DataFrame, target_column: str, metrics_location: Path = None) -> NamedTuple: + """ + For any validation data, a set of predictions and metrics are return + """ + + def optimise_model_for_deployment(self): + """ + Perfomance post processing on Model to ensure ready for deployment + """ + + def generate_meta_data(self): + """ + """ diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py new file mode 100644 index 00000000..137f2f20 --- /dev/null +++ b/model_data/simulation_system/MLModel/Models.py @@ -0,0 +1,142 @@ +""" +Different implementations of the MLModel Protocol +Uses the BaseMLModel protocol +Key tasks: +- Template Model class for different model types +- Save model +- Load Model +- Generate Inference +""" + +from typing import NamedTuple +from pathlib import Path +import pandas as pd +from autogluon.tabular import TabularDataset, TabularPredictor +from sklearn.metrics import mean_absolute_percentage_error +from core.Logger import logger + +AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types'] +METRIC_FILENAME = "metrics.csv" + +class AutogluonModel: + """ + Autogluon model that implements the MLModel Protocol + """ + def __init__(self, output_filepath: Path = None) -> None: + self.model = None + self.output_filepath = output_filepath + self.predictions = None + + def load_model(self, filepath: Path) -> None: + """ + Providing a path, this function will load the model to be used. 
Will load to internal variable + """ + self.model = TabularPredictor.load(path=filepath) + + def save_model(self, output_filepath: Path = None) -> None: + """ + Providing a path, this function will save the model to be used. + """ + logger.info("Using AutoGluon Model - Model saving already occured") + + def train_model( + self, + data: pd.DataFrame, + target_column: str, + hyperparameters: dict = None) -> None: + """ + For the given data and hyperparameters, a model is trained + """ + if self.output_filepath is None: + logger.error("Please specify a output_filepath in order to train a model") + exit(1) + + if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()): + print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required") + exit(1) + + AGdata = TabularDataset(data=data) + + self.model = TabularPredictor( + label=target_column, + path=self.output_filepath, + problem_type=hyperparameters['problem_type'], + eval_metric=hyperparameters['eval_metric'] + ).fit( + AGdata, + time_limit=hyperparameters['time_limit'], + presets=hyperparameters['presets'], + excluded_model_types=hyperparameters['excluded_model_types'] + ) + + + def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame: + """ + For the given dataframe, model is loaded and predictions are generated + """ + + if self.model is None: + print("No model loaded/ trained") + exit(1) + + predictions = self.model.predict(data) + + return predictions + + def model_evaluation( + self, + validation_data: pd.DataFrame, + target_column: str, + metrics_location: Path = None, + metric_filename: str = METRIC_FILENAME + ) -> pd.DataFrame: + """ + For any validation data, a set of predictions and metrics are return + """ + if metrics_location is None: + logger.warning("Metrics will be outputted to current folder") + + if self.model is None: + logger.error("No model loaded/ trained - Unable to generate evaluation") + exit(1) + + performance = 
self.model.evaluate(validation_data) + predictions = self.generate_predictions(validation_data) + + logger.info("Prediction used for evaluations are saved in self.prediction") + self.predictions = predictions + + # TODO: Can have a custom metric class that defines all different metrics we want + metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions) + + performance['mape'] = metric_mape + + logger.info("Saving metric file as metric.csv") + metrics_location.mkdir(exist_ok=True) + + metrics_df = pd.DataFrame([performance]) + metrics_df.to_csv(metrics_location / metric_filename) + markdown_filename = metric_filename.split(".")[0] + ".md" + metrics_df.to_markdown(metrics_location/ markdown_filename) + + return metrics_df + + def optimise_model_for_deployment(self, deployment_path: Path = None) -> None: + """ + We can optimise the deployment for a autogluon model + """ + if self.model is None: + logger.error("No model to optimise for deployment") + exit(1) + + if deployment_path is None: + logger.error("Deployment path required") + exit(1) + + # This will return a string path of the location + return self.model.clone_for_deployment(deployment_path) + + + + + \ No newline at end of file diff --git a/model_data/simulation_system/MLModel/__init__.py b/model_data/simulation_system/MLModel/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/Makefile b/model_data/simulation_system/Makefile new file mode 100644 index 00000000..97df8d3e --- /dev/null +++ b/model_data/simulation_system/Makefile @@ -0,0 +1,14 @@ +.PHONY: init +init: build docker + +.PHONY: build +build: + docker-compose build + +.PHONY: docker +docker: + docker-compose up -d + +.PHONY: down +down: + docker compose down \ No newline at end of file diff --git a/model_data/simulation_system/README.md b/model_data/simulation_system/README.md new file mode 100644 index 00000000..b6fe8327 --- /dev/null +++ 
b/model_data/simulation_system/README.md @@ -0,0 +1,66 @@ +# Simulation System + +Starter Readme: +Steps for pipeline: + +- (WIP) Use the Makefile to start up a mock S3 service + - By running `make init`, this will run the `docker-compose build` and `docker-compose up -d`, which spins up an S3 service + - This docker compose is running in detached mode `-d`, so it will not output anything to the terminal + +- Once the MinIO service is running, you can run the `training.py` file to start a model build process + - This will output a model, for a given target column, and add a model name composed of some of the hyperparameters + - An example of running this file is: + - `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet` + - Outputs of the pipeline are: + - A model directory bucket + - A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE) + - A model type prefix (i.e. autogluon, tensorflow etc) + - A model name prefix (i.e. 
rdsap_change_medium_quality_60_TIMESTAMP) + - This model name is made up of target variable, quality, time spent training and timestamp + - Within this prefix, there are three folders: + - model + - The model path that can be loaded in the codebase + - deployment + - The optimised model that can be deployed (may or may not need this) + - metrics + - The metrics generated from the model (may or may not need this as this can be contained in the registry) + +- Once model build is finished, you can run the `predictions.py` file to generate predictions + - By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry + - This can be overridden by specifying `--model-path`, which will load an alternative model + - There are two ways of getting data into the pipeline: + - Using the `--data` argument: + - This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'` + - Note the single and double quotation marks, as this affects the ingestion + - Using the `--data-path` argument: + - This can be a filepath (Can imagine that we might want to pull data from S3/ DB) + - An example of running the file is: + - `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet` + - Outputs of the pipeline are: + - prediction bucket + - a Target variables prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE) + - a uprn prefix (i.e. 0123456789) + - a `prediction.json` + - a `metadata.json` + - This is all the metadata from the model (can change this if needed) + +- NOTE: If you wish to change any settings, these are currently all in the `core/Settings.py` file + - It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify. + - I.e. 
the hyperparameters for models are in here but will move into a separate configuration file + + +# TODO: +- Structure/ MLOps: + - Add configuration files (dev, staging, prod), including hyperparamters + - Add precommit hooks (linters, branch names, etc) + - Sphinx documentation + - Sort out local mock up services + - Sort out Model Registry + - Sort out Data version control +- Data Science: + - Implement a metrics class, to hold all metric + - Rebuild metrics script (Could be a one off but good to have) + - Determine metrics + - Implement and test custom model (Tensorflow Decision Trees etc) +- Orchestration: + - Lambda handler for the pipeline \ No newline at end of file diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py new file mode 100644 index 00000000..1e811f8d --- /dev/null +++ b/model_data/simulation_system/core/DataLoader.py @@ -0,0 +1,21 @@ +import pandas as pd +from core.Logger import logger + +class DataLoader(): + + @staticmethod + def load(filepath: str, index_col: str = None) -> pd.DataFrame: + """ + Load different datasets + """ + if filepath.endswith('.parquet'): + df = pd.read_parquet(filepath) + if index_col is not None: + df = df.set_index(index_col) + elif filepath.endswith('.csv'): + df = pd.read_csv(filepath, index_col=index_col) + else: + logger.error('Not implemented!') + exit(1) + + return df \ No newline at end of file diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py similarity index 99% rename from model_data/simulation_system/DataProcessor.py rename to model_data/simulation_system/core/DataProcessor.py index 50abd8e3..1ac53517 100644 --- a/model_data/simulation_system/DataProcessor.py +++ b/model_data/simulation_system/core/DataProcessor.py @@ -2,7 +2,7 @@ from pathlib import Path import numpy as np import pandas as pd from model_data.BaseUtility import Definitions -from simulation_system.Settings import ( +from 
simulation_system.core.Settings import ( DATA_PROCESSOR_SETTINGS, EARLIEST_EPC_DATE, FULLY_GLAZED_DESCRIPTIONS, diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py new file mode 100644 index 00000000..aef9605f --- /dev/null +++ b/model_data/simulation_system/core/FeatureProcessor.py @@ -0,0 +1,70 @@ +""" +Create additional features from the dataset +""" + +import pandas as pd +from typing import List +from core.Logger import logger + +RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE'] +HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE'] + +RANDOM_SEED = 0 + +class FeatureProcessor: + """ + Handle all feature manipulation before modelling + """ + + @staticmethod + def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame: + """ + Remove the unused columns for RDS + """ + if target_column == "RDSAP_CHANGE": + df = df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS) + elif target_column == "HEAT_DEMAND_CHANGE": + df = df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS) + return df + + @staticmethod + def retain_features(df: pd.DataFrame, features: List[str] = None): + """ + Determine which columns to keep for modelling + """ + if features is None: + features = df.columns + else: + if not set(features).issubset(df.columns): + logger.error('Features defined is not contained in data') + exit(1) + + df = df[features] + + return df + + @staticmethod + def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame: + """ + Sample data to reduce number of rows for model building if needed + """ + + if subsample_amount: + df = df.sample(subsample_amount, random_state=RANDOM_SEED) + return df + + + def process( + self, + df: pd.DataFrame, + target_column: str = "RDSAP_CHANGE", + features: List[str] = None, + subsample_amount: int = None + ) -> pd.DataFrame: + """ + Pipeline to get data ready for building a model + """ + df = self.subsample_data(df, 
subsample_amount=subsample_amount) + df = self.drop_unused_columns(df, target_column=target_column) + df = self.retain_features(df, features=features) + return df diff --git a/model_data/simulation_system/Logger.py b/model_data/simulation_system/core/Logger.py similarity index 89% rename from model_data/simulation_system/Logger.py rename to model_data/simulation_system/core/Logger.py index 5197e7ce..8603fff6 100644 --- a/model_data/simulation_system/Logger.py +++ b/model_data/simulation_system/core/Logger.py @@ -1,3 +1,7 @@ +""" +Logger that will be used throughout the application +""" + import logging def setup_logger(): diff --git a/model_data/simulation_system/Settings.py b/model_data/simulation_system/core/Settings.py similarity index 78% rename from model_data/simulation_system/Settings.py rename to model_data/simulation_system/core/Settings.py index 1d302abf..e562a39b 100644 --- a/model_data/simulation_system/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -1,5 +1,34 @@ # Using a simply python file as settings for now # TODO: migrate to dynaconf +from pathlib import Path + +# Can move to a hyperparmeters file +# If anything we might want to have a file that can be loaded and sent to this script +MODEL_HYPERPARAMETERS = { + "autogluon": { + 'problem_type': 'regression', + 'eval_metric': 'mean_absolute_error', + 'time_limit': 30, + 'presets': 'medium_quality', + 'excluded_model_types': None + } +} + +RANDOM_SEED = 0 +SUBSAMPLE_FACTOR = 200 + +TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet' +TEST_DATA_NAME = 'test_data.parquet' + +REGISTRY_FILE = "model_registry.csv" +MODEL_DIRECTORY = "model_directory" +BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY +PREDICTION_LOCATION = Path("predictions") +PREDICTION_FILE = 'prediction.json' +METADATA_FILE = 'metadata.json' +MODEL_FOLDER = "model" +METRICS_FOLDER = "metrics" +DEPLOYMENT_FOLDER = "deployment" TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 diff --git a/model_data/simulation_system/core/__init__.py b/model_data/simulation_system/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/docker-compose.yml b/model_data/simulation_system/docker-compose.yml new file mode 100644 index 00000000..55f181bc --- /dev/null +++ b/model_data/simulation_system/docker-compose.yml @@ -0,0 +1,17 @@ +version: '3' + +services: + minio: + image: minio/minio + ports: + - "9000:9000" + - "9001:9001" + volumes: + - ./data:/data + environment: + MINIO_ROOT_USER: &MINIO_USER admin + MINIO_ROOT_PASSWORD: &MINIO_PASS password + command: server --console-address ":9001" /data + +# volumes: +# minio_storage: {} \ No newline at end of file diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py index 87ad3799..ab29f39c 100644 --- a/model_data/simulation_system/energy_predictor.py +++ b/model_data/simulation_system/energy_predictor.py @@ -1,5 +1,5 @@ from pathlib import Path -from Settings import ( +from core.Settings import ( RDSAP_RESPONSE, FLOOR_LEVEL_MAP, BUILT_FORM_REMAP, diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/generate_rdsap_change.py similarity index 95% rename from model_data/simulation_system/app.py rename to model_data/simulation_system/generate_rdsap_change.py index 9ac2c13d..2400e7c7 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -1,24 +1,24 @@ import numpy as np import pandas as pd from tqdm import tqdm -from model_data.BaseUtility import Definitions + from pathlib import Path -from model_data.simulation_system.Settings import ( +from core.Settings import ( MANDATORY_FIXED_FEATURES, AVERAGE_FIXED_FEATURES, LATEST_FIELD, COMPONENT_FEATURES, RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, - COLUMNS_TO_MERGE_ON, - FLOOR_LEVEL_MAP, - BUILT_FORM_REMAP + COLUMNS_TO_MERGE_ON ) -from 
DataProcessor import DataProcessor +from core.DataProcessor import DataProcessor DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' +# TODO: Have a look at temporal features + def app(): # Get all the files in the directory @@ -85,9 +85,6 @@ def app(): # Take the more recent value since it's likely to be more accurate vals = [vals[-1]] - if len(vals) == 0: - wrong_var - fixed_data[field] = np.mean(vals) # Combine all fields together diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py new file mode 100644 index 00000000..bc1b113b --- /dev/null +++ b/model_data/simulation_system/predictions.py @@ -0,0 +1,134 @@ +""" +Script to load MLModel class and generate predictions +""" + +import json +import argparse +from MLModel.Models import AutogluonModel +from core.Logger import logger +from core.DataLoader import DataLoader +from pathlib import Path +import pandas as pd +from typing import Optional +from datetime import datetime +from core.Settings import ( + BASE_REGISTRY_PATH, + REGISTRY_FILE, + PREDICTION_LOCATION, + PREDICTION_FILE, + METADATA_FILE +) + +TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") + +# FOR TESTING +# For now just loading data first and then passing into function (i.e. 
as if we receive json data and convert to DataFrame) +# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") +# DATA = TEST_DATA.sample(1) + + +def ingest_arguments() -> argparse.Namespace: + """ + Helper function to take in arguments from script start + """ + + parser = argparse.ArgumentParser(description='Inputs for training script') + parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') + parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here') + parser.add_argument('--data', type=str, help='Json data for predictions') + parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training') + + args = parser.parse_args() + + return args + + + +def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): + """ + Main pipeline function + """ + + registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE + + if registry_path is None or not registry_path.exists(): + logger.error("No registry path provided or registry doesn't exist") + exit(1) + + if model_path is not None: + logger.info("User specified a model to load - ignoring registry") + model_location = model_path + model_type = model_path + model_name = model_path + else: + # TODO: Think about where registry will sit/ type + logger.info("Loading best model from registry") + registry_df = pd.read_csv(registry_path) + best_model_df = registry_df[registry_df['best_model']] + + model_location = best_model_df['model_location'].values[0] + model_type = best_model_df['model_type'].values[0] + model_name = best_model_df['model_name'].values[0] + + logger.info("--- Model Info: ---") + logger.info(f"Model type: {model_type}") + logger.info(f"Model name: 
{model_name}") + logger.info(f"Model location: {model_location}") + + logger.info("--- Loading Data ---") + if data is None and data_path is None: + logger.error("No Data/Data Path passed") + exit(1) + if data_path and data is None: + logger.info("Loading data from provided path") + data = DataLoader().load(filepath=data_path, index_col="UPRN") + + # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION + data = data.sample(1) + else: + logger.info('Using data provided') + data = json.loads(data) + data = pd.DataFrame([data]) + print(data) + + logger.info("--- Loading Model ---") + model = AutogluonModel() + model.load_model(filepath=model_location) + + logger.info("--- Generating Predictions ---") + prediction = model.generate_predictions(data=data) + + # Save prediction some where? + # prediction.to_csv("s3?") + + # TODO: Check how we want to structure outputs + # For now, just categorise by uprn and timestamp + # Assume one uprn coming in for now + uprn = data.index.values[0] + + # Saving prediction local for now + # TODO: change uprn to TARGET_ID, put in setting + logger.info("--- Outputting prediction and metadata --- ") + output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP + output_base.mkdir(parents=True, exist_ok=True) + + # TODO: change model.model.info to a class method for MLModel + json_prediction = prediction.to_json(output_base / PREDICTION_FILE) + prediction_metadata = { + "model_type": model_type, + "model_name": model_name, + "model_location": model_location, + "model_settings": model.model.info() + } + + pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE) + + return json_prediction + +if __name__ == "__main__": + + args = ingest_arguments() + + # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' + # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet + 
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file diff --git a/model_data/simulation_system/requirements.txt b/model_data/simulation_system/requirements.txt new file mode 100644 index 00000000..62db1719 --- /dev/null +++ b/model_data/simulation_system/requirements.txt @@ -0,0 +1,208 @@ +absl-py==1.4.0 +accelerate==0.16.0 +aiohttp==3.8.5 +aiohttp-cors==0.7.0 +aiosignal==1.3.1 +aliyun-python-sdk-core==2.13.36 +aliyun-python-sdk-kms==2.16.1 +antlr4-python3-runtime==4.9.3 +asttokens==2.2.1 +async-timeout==4.0.3 +attrs==23.1.0 +autogluon==0.8.2 +autogluon.common==0.8.2 +autogluon.core==0.8.2 +autogluon.features==0.8.2 +autogluon.multimodal==0.8.2 +autogluon.tabular==0.8.2 +autogluon.timeseries==0.8.2 +backcall==0.2.0 +beautifulsoup4==4.12.2 +blessed==1.20.0 +blis==0.7.10 +boto3==1.28.25 +botocore==1.31.25 +cachetools==5.3.1 +catalogue==2.0.9 +catboost==1.2 +certifi==2023.7.22 +cffi==1.15.1 +charset-normalizer==3.2.0 +click==8.1.6 +cloudpickle==2.2.1 +colorama==0.4.6 +colorful==0.5.5 +comm==0.1.4 +confection==0.1.1 +contourpy==1.1.0 +crcmod==1.7 +cryptography==41.0.3 +cycler==0.11.0 +cymem==2.0.7 +datasets==2.14.4 +debugpy==1.6.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.7 +distlib==0.3.7 +evaluate==0.3.0 +executing==1.2.0 +fastai==2.7.12 +fastcore==1.5.29 +fastdownload==0.0.7 +fastprogress==1.0.3 +filelock==3.12.2 +fonttools==4.42.0 +frozenlist==1.4.0 +fsspec==2023.6.0 +future==0.18.3 +gdown==4.7.1 +gluonts==0.13.3 +google-api-core==2.11.1 +google-auth==2.22.0 +google-auth-oauthlib==1.0.0 +googleapis-common-protos==1.60.0 +gpustat==1.1 +graphviz==0.20.1 +grpcio==1.50.0 +huggingface-hub==0.16.4 +hyperopt==0.2.7 +idna==3.4 +imageio==2.31.1 +ipykernel==6.25.1 +ipython==8.14.0 +jedi==0.19.0 +Jinja2==3.1.2 +jmespath==0.10.0 +joblib==1.3.2 +jsonschema==4.17.3 +jupyter_client==8.3.0 +jupyter_core==5.3.1 +kiwisolver==1.4.4 +langcodes==3.3.0 +lightgbm==3.3.5 
+lightning-utilities==0.9.0 +llvmlite==0.40.1 +Markdown==3.4.4 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +matplotlib==3.7.2 +matplotlib-inline==0.1.6 +mdurl==0.1.2 +mlforecast==0.7.3 +model-index==0.1.11 +msgpack==1.0.5 +multidict==6.0.4 +multiprocess==0.70.15 +murmurhash==1.0.9 +nest-asyncio==1.5.7 +networkx==3.1 +nlpaug==1.1.11 +nltk==3.8.1 +nptyping==2.4.1 +numba==0.57.1 +numpy==1.24.4 +nvidia-ml-py==12.535.77 +oauthlib==3.2.2 +omegaconf==2.2.3 +opencensus==0.11.2 +opencensus-context==0.1.3 +opendatalab==0.0.10 +openmim==0.3.9 +openxlab==0.0.17 +ordered-set==4.1.0 +oss2==2.17.0 +packaging==23.1 +pandas==1.5.3 +parso==0.8.3 +pathy==0.10.2 +patsy==0.5.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.5.0 +platformdirs==3.10.0 +plotly==5.16.0 +preshed==3.0.8 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +protobuf==3.20.2 +psutil==5.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +py-spy==0.3.14 +py4j==0.10.9.7 +pyarrow==12.0.1 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycparser==2.21 +pycryptodome==3.18.0 +pydantic==1.10.12 +Pygments==2.16.1 +pyparsing==3.0.9 +pyrsistent==0.19.3 +PySocks==1.7.1 +pytesseract==0.3.10 +python-dateutil==2.8.2 +pytorch-lightning==1.9.5 +pytorch-metric-learning==1.7.3 +pytz==2023.3 +PyWavelets==1.4.1 +PyYAML==6.0.1 +pyzmq==25.1.0 +ray==2.3.1 +regex==2023.8.8 +requests==2.28.2 +requests-oauthlib==1.3.1 +responses==0.18.0 +rich==13.4.2 +rsa==4.9 +s3transfer==0.6.1 +safetensors==0.3.2 +scikit-image==0.19.3 +scikit-learn==1.2.2 +scipy==1.11.1 +seaborn==0.12.2 +sentencepiece==0.1.99 +seqeval==1.2.2 +six==1.16.0 +smart-open==6.3.0 +soupsieve==2.4.1 +spacy==3.6.1 +spacy-legacy==3.0.12 +spacy-loggers==1.0.4 +srsly==2.4.7 +stack-data==0.6.2 +statsforecast==1.4.0 +statsmodels==0.14.0 +tabulate==0.9.0 +tenacity==8.2.2 +tensorboard==2.14.0 +tensorboard-data-server==0.7.1 +tensorboardX==2.6.2 +text-unidecode==1.3 +thinc==8.1.12 +threadpoolctl==3.2.0 +tifffile==2023.7.18 +timm==0.9.5 +tokenizers==0.13.3 +toolz==0.12.0 +torch==1.13.1 +torchmetrics==0.11.4 
+torchvision==0.14.1 +tornado==6.3.2 +tqdm==4.65.1 +traitlets==5.9.0 +transformers==4.26.1 +typer==0.9.0 +typing_extensions==4.7.1 +tzdata==2023.3 +ujson==5.8.0 +urllib3==1.26.16 +virtualenv==20.24.3 +wasabi==1.1.2 +wcwidth==0.2.6 +Werkzeug==2.3.6 +window-ops==0.0.14 +xgboost==1.7.6 +xxhash==3.3.0 +yarl==1.9.2 diff --git a/model_data/simulation_system/test_data_generation.py b/model_data/simulation_system/test_data_generation.py index fb7d7c64..d57c90f8 100644 --- a/model_data/simulation_system/test_data_generation.py +++ b/model_data/simulation_system/test_data_generation.py @@ -1,9 +1,12 @@ -from Logger import logger +from core.Logger import logger import argparse import pandas as pd from pathlib import Path - -RANDOM_SEED = 0 +from core.Settings import ( + RANDOM_SEED, + TRAIN_AND_VALIDATION_DATA_NAME, + TEST_DATA_NAME +) def ingest_arguments() -> argparse.Namespace: """ @@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp logger.info('--- Saving data ---') - train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet') - test_data.to_parquet(Path(output_folder)/'test_data.parquet') + train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME) + test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME) logger.info(' ---Pipeline complete---') diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index da2c6f4a..b37e7154 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,19 +1,49 @@ -import os -import pandas as pd + import argparse +# import boto3 +import os +from pathlib import Path +from datetime import datetime from typing import List -from Logger import logger -from autogluon.tabular import TabularDataset, TabularPredictor +from core.Logger import logger +from core.DataLoader import DataLoader +from core.FeatureProcessor import FeatureProcessor +from MLModel.Models 
import AutogluonModel +import pandas as pd +from core.Settings import ( + MODEL_DIRECTORY, + BASE_REGISTRY_PATH, + REGISTRY_FILE, + MODEL_FOLDER, + METRICS_FOLDER, + DEPLOYMENT_FOLDER, + SUBSAMPLE_FACTOR, + MODEL_HYPERPARAMETERS +) +import seaborn as sns +import matplotlib.pyplot as plt - -DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE'] -FEATURE_COLUMNS = None -RANDOM_SEED = 0 +TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") # FOR TESTING -train_filepath = "./model_build_data/train_validation_data.parquet" -test_filepath = "./model_build_data/test_data.parquet" +# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" +# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet" +# target_column = "RDSAP_CHANGE" +# model_type = "autogluon" +# hyperparameter = HYPERPARAMETERS +# SUBSAMPLE_FACTOR = 200 +# SESSION = boto3.Session() + +# S3_CLIENT = SESSION.client( +# service_name="s3", +# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'), +# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'), +# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000") +# ) + +# S3_CLIENT.create_bucket +# S3_CLIENT.list_buckets() def ingest_arguments() -> argparse.Namespace: """ @@ -22,116 +52,148 @@ def ingest_arguments() -> argparse.Namespace: parser = argparse.ArgumentParser(description='Inputs for training script') - parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training') - parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing') + parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True) + parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True) + parser.add_argument('--model-type', type=str, help='The type of model to train', 
choices=["autogluon"], default="autogluon") + parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') args = parser.parse_args() return args - - -class DataLoader(): - - @staticmethod - def load(filepath: str) -> pd.DataFrame: - """ - Load different datasets - """ - if filepath.endswith('.parquet'): - df = pd.read_parquet(filepath) - elif filepath.endswith('.csv.'): - df = pd.read_csv(filepath) - else: - logger.error('Not implemented!') - exit(1) - - return df - -class FeatureProcessor: - """ - Handle all feature manipulation before modelling - """ - - @staticmethod - def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame: - df = df.drop(columns=[drop_columns]) - return df - - def retain_features(df: pd.DataFrame, features: List[str] = None): - """ - Determine which columns to keep ofr modelling - """ - if features is None: - features = df.columns - else: - if not set(features).issubset(df.columns): - logger.error('Features defined is not contained in data') - exit(1) - - df = df[features] - - return df - - def process(self, df: pd.DataFrame) -> pd.DataFrame: - df = self.drop_columns(df, drop_columns=DROP_COLUMNS) - df = self.retain_features(df, features=FEATURE_COLUMNS) - return df - -def training(train_filepath: str, test_filepath: str) -> None: +def training( + train_filepath: str, + test_filepath: str, + target_column: str = "RDSAP_CHANGE", + model_type: str = "autogluon", + hyperparameters: dict = None + ) -> None: """ Pipeline to run training on the dataset """ - logger.info('Loading data') + logger.info('--- Loading data ---') dataloader = DataLoader() train_df = dataloader.load(filepath=train_filepath) test_df = dataloader.load(filepath=test_filepath) - - # df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE']) - logger.info('Feature processing') + logger.info('--- Feature processing ---') + feature_processor = 
FeatureProcessor() - train_df = feature_processor.process(train_df) - test_df = feature_processor.process(test_df) - # logger.info('Split data into train and validation') + subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR) - logger.info('Build Model') + train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount) + test_df = feature_processor.process(test_df, target_column=target_column) + + logger.info('--- Build Model ---') + + logger.info("--- Load Hyperparameters ---") + + if hyperparameters is None: + logger.info("Use base hyperparameters in settings") + hyperparameters = MODEL_HYPERPARAMETERS[model_type] + logger.info(f'Hyperparameters are: {hyperparameters}') + + if model_type == "autogluon": + model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() + output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root + + model = AutogluonModel( + output_filepath = output_base / MODEL_FOLDER + ) + else: + logger.error("No alternative model implemented yet") + exit(1) - data = TabularDataset(data=train_filepath) - data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE']) - TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT'] - # top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))] + model.train_model( + data=train_df, + target_column=target_column, + hyperparameters=hyperparameters + ) + + logger.info("--- Save Model ---") + model.save_model(output_filepath=model.output_filepath) - data = data[['RDSAP_CHANGE'] + top_features.to_list()] - # data = TabularDataset(data=train_df) - # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float) - subsample_size = round(len(data)/20) - data = data.sample(subsample_size, random_state=RANDOM_SEED) + logger.info('--- Generate evaluation metrics ---') + metrics_df = model.model_evaluation( + validation_data=test_df, + 
target_column=target_column, + metrics_location = output_base / METRICS_FOLDER + ) + + logger.info("--- Generate metric outputs using predictions ---") + # TODO: can have a model.metric_outputs method + # For now just do it here + residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred']) + + # image formatting + # TODO: move to settings file: AXIS_FONT, TITLE_FONT + axis_fs = 18 #fontsize + title_fs = 22 #fontsize + sns.set(style="whitegrid") + ax = sns.scatterplot(x="true", y="pred",data=residual_df) + ax.set_aspect('equal') + ax.set_xlabel(f'True {target_column}',fontsize = axis_fs) + ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel + ax.set_title('Residuals', fontsize = title_fs) - # Add custom metric class MAPE - # Have a look at temporal features + # Reference line y = x (a perfect prediction falls on it) + ax.plot([-100, 100], [-100, 100], 'black', linewidth=1) - target_column = 'RDSAP_CHANGE' - predictor_RDSAP = TabularPredictor( - label=target_column, - path="agModels-predictRDSAP", - problem_type="regression", - eval_metric='mean_absolute_error' - ).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN']) + plt.tight_layout() + RESIDUAL_FILE = "residuals.png" + plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) + # TODO: for cml, we might want to have a class that outputs all data and plots to add to the report + # If we want the residual plot (or any plots) in the report, we will need to self host + # plt.savefig(RESIDUAL_FILE, dpi=120) + + # TODO: introduce a separate script for model optimisation, and from there, optimise for deployment + # Imagining for now that the model trained here is the best model amongst all models built - logger.info('Evaluate matrics') + logger.info("--- Optimising model for deployment ---") - test_data = TabularDataset('./model_build_data/test_data.parquet') - performance = predictor_RDSAP.evaluate(test_data) - predictions = predictor_RDSAP.predict(test_data) + 
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER) + logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") + + # TODO: Need a model registry - for now have this as a CSV + # Save this in the model directory + logger.info("--- Append registry with new model ---") + + registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE + + if registry_path.exists(): + logger.info("Registry file found - Loading into Dataframe") + registry_df = pd.read_csv(registry_path, index_col=None) + else: + # TODO: Move columns into settings: MODEL_DETAILS and Metrics class columns + registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) + + model_details_df = pd.DataFrame( + [{ + 'model_type': model_type, + 'model_name': model_root, + 'model_location': deployment_model_path + }] + ) + + registry_row = pd.concat([model_details_df, metrics_df], axis=1) + registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) + + # TODO: will need a metric rebuild script, i.e. 
if we add new metrics, we will want to load models and regenerate new metrics + # TODO: decide metric to optimise to. NOTE(review): the descending sort puts the largest mean_absolute_error at row 0, which is then flagged best_model - confirm the sign convention of the stored metric + registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True) + registry_df['best_model'] = [False]*len(registry_df) + registry_df.loc[0, 'best_model'] = True + + logger.info("--- Saving new model to registry ---") + registry_df.to_csv(registry_path, index=False) + + logger.info("--- Training Pipeline Complete --- ") - test_data['predictions'] = predictions - test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions']) if __name__ == "__main__": @@ -140,4 +202,11 @@ if __name__ == "__main__": logger.info('---Ingest Arguments---') args = ingest_arguments() - training(train_filepath=args.train_filepath, test_filepath=args.test_filepath) \ No newline at end of file + # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet + # TODO: Ingest hyperparameters from somewhere - currently taken from MODEL_HYPERPARAMETERS in core.Settings + training( + train_filepath=args.train_filepath, + test_filepath=args.test_filepath, + target_column=args.target_column, + model_type=args.model_type + )