From c62f32c1e505616e816d3a6978a0439e462ed569 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 17 Sep 2023 10:50:26 +0000 Subject: [PATCH 1/3] start refactor of dataclient --- .../src/pipeline/src/configs/build_model.yaml | 10 +- .../src/configs/feature_processor.yaml | 4 +- .../pipeline/src/configs/prepare_data.yaml | 2 +- .../src/pipeline/src/core/DataClient.py | 332 ++++++++++-------- .../src/core/interface/InterfaceDataClient.py | 42 +-- modules/ml-pipeline/src/pipeline/src/dvc.lock | 51 +-- .../src/pipeline/src/prepare_data.py | 55 +-- 7 files changed, 256 insertions(+), 240 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml index a1307c1..a8510c6 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml @@ -1,5 +1,5 @@ -model_type: AutogluonAutoML -model_save_filepath: ./data/model/autogluonmodel/ +model_type: SKLearnLinearRegression +model_save_filepath: ./data/model/model.joblib SKLearnLinearRegression: null @@ -10,6 +10,6 @@ AutogluonAutoML: output_filepath: ./data/model/autogluonmodel/ problem_type: regression eval_metric: mean_absolute_error - time_limit: 200 - presets: medium_quality - excluded_model_types: null + time_limit: 400 + presets: high_quality + excluded_model_types: ['KNN'] diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml index 9aa02f0..03c142d 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml @@ -4,5 +4,5 @@ feature_processor_config: subsample_seed: 0 target: RDSAP_CHANGE drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"] - # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "TOTAL_FLOOR_AREA_ENDING"] - retain_features: null + retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"] + # retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index 273e78d..cf99d6a 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -2,7 +2,7 @@ input_dataclient_type: aws-s3 output_dataclient_type: local datahandler_type: parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet -train_proportion: 0.1 +train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py index cb5b8d7..98ad821 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py @@ -4,27 +4,34 @@ Implementations of the DataClient Protocol import os import boto3 +import pandas as pd from pathlib import Path from io import BytesIO -from typing import List +from typing import List, Union from core.interface.InterfaceDataClient import DataClient from core.Logger import logger -def dataclient_factory(dataclient_type: str) -> DataClient: +def dataclient_factory( + dataclient_type: str, dataclient_config: Union[dict, None] +) -> DataClient: """ Determine which dataclient to use """ + + if dataclient_config is None: + dataclient_config = {} + dataclients = { - "local": LocalClient(), - "aws-s3": AWSS3Client(), + "local": LocalClient, + "aws-s3": AWSS3Client, # ADD MORE DATACLIENTS HERE } if dataclient_type not in dataclients: raise ValueError("Dataclient type specified is not in factory") - return dataclients[dataclient_type] + return dataclients[dataclient_type](**dataclient_config) def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str): @@ -32,144 +39,142 @@ def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str): raise ValueError(f"Incorrect {config_type} keys specified") -# class MinioClient: -# """ -# Using the Minio s3 client, to do local testing -# """ - -# ACCEPTED_CONFIG_KEYS = [ -# "aws_access_key_id", -# "aws_secret_access_key", -# "endpoint_url", -# ] -# ACCEPTED_LOAD_CONFIG_KEYS = [] -# ACCEPTED_SAVE_CONFIG_KEYS = [] - -# def ingest_configurations(self, config: dict) -> None: -# """ -# Load all configuration into the instance (self.config) -# """ -# validate_dict_keys( -# keys_1=list(config.keys()), -# keys_2=self.ACCEPTED_CONFIG_KEYS, -# config_type="config", -# ) - -# self.config = config - -# def establish_client(self) -> None: -# """ -# With the given configurations, create the connection to the client (self.client) -# """ - -# ... - -# def download_data(self, download_config: dict) -> pd.DataFrame: -# """ -# When the client is established, we can load data -# """ -# validate_dict_keys( -# keys_1=list(download_config.keys()), -# keys_2=self.ACCEPTED_LOAD_CONFIG_KEYS, -# config_type="load_config", -# ) - -# return pd.DataFrame() - -# def save_data(self, obj: object, save_config: dict) -> None: -# """ -# When the client is established, we can save out objects -# """ -# validate_dict_keys( -# keys_1=list(save_config.keys()), -# keys_2=self.ACCEPTED_SAVE_CONFIG_KEYS, -# config_type="save_config", -# ) - - class AWSS3Client: """ Using Boto3, set up the AWS client """ - ACCEPTED_CONFIG_KEYS = [ - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", - ] + def __init__( + self, + AWS_ACCESS_KEY_ID: Union[str, None], + AWS_SECRET_ACCESS_KEY: Union[str, None], + ENDPOINT_URL: Union[str, None], + ): + self.AWS_ACCESS_KEY_ID = AWS_ACCESS_KEY_ID + self.AWS_SECRET_ACCESS_KEY = AWS_SECRET_ACCESS_KEY + self.ENDPOINT_URL = ENDPOINT_URL + + self._establish_client() + ACCEPTED_LOAD_CONFIG_KEYS = [] ACCEPTED_SAVE_CONFIG_KEYS = [] - def ingest_configurations(self, config: dict) -> None: - """ - Load all configuration into the instance (self.config) - """ - validate_dict_keys( - keys_1=self.ACCEPTED_CONFIG_KEYS, - keys_2=list(config.keys()), - config_type="Ingest Config", - ) - self.config = config - - def establish_client(self) -> None: + def _establish_client(self) -> None: """ With the given configurations, create the connection to the client (self.client) """ logger.info(f"Establishing S3 Client") session = boto3.Session() - if ( - self.config["AWS_ACCESS_KEY_ID"] is None - and self.config["AWS_SECRET_ACCESS_KEY"] is None - ): + if self.AWS_ACCESS_KEY_ID is None and self.AWS_SECRET_ACCESS_KEY is None: self.client = session.client(service_name="s3") # Using local credentials else: self.client = session.client( service_name="s3", - aws_access_key_id=self.config["AWS_ACCESS_KEY_ID"], - aws_secret_access_key=self.config["AWS_SECRET_ACCESS_KEY"], + aws_access_key_id=self.AWS_ACCESS_KEY_ID, + aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY, + endpoint_url=self.ENDPOINT_URL, ) - def download_data(self, location: dict) -> None: + def load_data( + self, location: str, filetype: str, load_config: Union[dict, None] = None + ) -> pd.DataFrame: """ - When the client is established, we can download data to a local file + Generic to load data """ - ... - def load_data_as_buffer(self, location: str) -> BytesIO: - """ - When the client is established, we can load data in a buffer - """ if not location.startswith("s3://"): raise ValueError("S3 file path specified without s3://") + if load_config is None: + load_config = {} + + load_methods = { + ".parquet": self._load_parquet, + # "": _load_directory(**load_config), + # ADD MORE load_methods HERE + } + + if filetype not in load_methods: + raise ValueError("load methods specified is not in factory") + + return load_methods[filetype](location=location, load_config=load_config) + + def save_data( + self, + obj: object, + location: str, + filetype: str, + save_config: Union[dict, None] = None, + ) -> None: + """ + Generic to save data + """ + + if not location.startswith("s3://"): + raise ValueError("S3 file path specified without s3://") + + if save_config is None: + save_config = {} + + save_methods = { + ".parquet": self._save_parquet, + # "": _save_directory(**save_config), + # ADD MORE save_methods HERE + } + + if filetype not in save_methods: + raise ValueError("save_methods specified is not in factory") + + return save_methods[filetype]( + obj=obj, location=location, save_config=save_config + ) + + def _save_parquet(self, obj: object, location: str, save_config: dict): + """ + Save object as parquet + """ + + buffer = BytesIO() + obj.to_parquet(buffer, index=False) + + bucket, key = location.strip("s3://").split("/", 1) + self.client.upload_fileobj(buffer, bucket, key) + + def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: + """ + Load a parquet file + """ + bucket, key = location.strip("s3://").split("/", 1) buffer = BytesIO() self.client.download_fileobj(bucket, key, buffer) - buffer.seek(0) - return buffer + df = pd.read_parquet(buffer, **load_config) - def load_database(self, database_location: dict) -> None: - """ - When the client is established, we can read from a database - """ - ... + return df - def upload_data(self, location: str) -> None: - """ - When the client is established, we can save out objects from a local file - """ - ... + # def load_data_as_buffer(self, location: str) -> BytesIO: + # """ + # When the client is established, we can load data in a buffer + # """ - def upload_data_from_buffer(self, buffer: BytesIO, location: str) -> None: - """ - When the client is established, we can save out objects from a buffer - """ - if not location.startswith("s3://"): - raise ValueError("S3 file path specified without s3://") + # bucket, key = location.strip("s3://").split("/", 1) + # buffer = BytesIO() + # self.client.download_fileobj(bucket, key, buffer) + # buffer.seek(0) - bucket, key = location.strip("s3://").split("/", 1) - self.client.upload_fileobj(buffer, bucket, key) + # return buffer + + # def upload_data_from_buffer(self, buffer: BytesIO, location: str) -> None: + # """ + # When the client is established, we can save out objects from a buffer + # """ + # if not location.startswith("s3://"): + # raise ValueError("S3 file path specified without s3://") + + # bucket, key = location.strip("s3://").split("/", 1) + # self.client.upload_fileobj(buffer, bucket, key) class LocalClient: @@ -177,54 +182,103 @@ class LocalClient: Interacting with data locally """ - def ingest_configurations(self, config: dict) -> None: + def __init__(self): """ - Load all configuration into the instance (self.config) + No initialisation needed for local client """ logger.info("Local - No configuration required") + self._establish_client() - def establish_client(self) -> None: + def _establish_client(self) -> None: """ With the given configurations, create the connection to the client (self.client) """ logger.info("Local - No establishing client required") - def download_data(self, location: dict) -> None: + def load_data( + self, location: str, filetype: str, load_config: Union[dict, None] = None + ) -> pd.DataFrame: """ - When the client is established, we can download data to a file + Generic to load data """ - ... - def load_data_as_buffer(self, location: str) -> BytesIO: - """ - When the client is established, we can load data from a buffer - """ - with open(location, "rb") as file: - # Read the entire file into a BytesIO object - buffer = BytesIO(file.read()) - buffer.seek(0) + if load_config is None: + load_config = {} - return buffer + load_methods = { + ".parquet": self._load_parquet, + # "": _load_directory(**load_config), + # ADD MORE load_methods HERE + } - def load_database(self, database_location: dict) -> None: - """ - When the client is established, we can read from a database - """ - ... + if filetype not in load_methods: + raise ValueError("load methods specified is not in factory") - def upload_data(self, location: str) -> None: - """ - When the client is established, we can save out objects from a file - """ - ... + return load_methods[filetype](location=location, load_config=load_config) - def upload_data_from_buffer(self, buffer: BytesIO, location: str) -> None: + def save_data( + self, + obj: object, + location: str, + filetype: str, + save_config: Union[dict, None] = None, + ) -> None: """ - When the client is established, we can save out objects from a buffer + Generic to save data """ if not Path(location).parent.exists(): os.makedirs(Path(location).parent) - # Write the contents of the buffer to the local file - with open(location, "wb") as f: - f.write(buffer.getvalue()) + if save_config is None: + save_config = {} + + save_methods = { + ".parquet": self._save_parquet, + # "": _save_directory(**save_config), + # ADD MORE save_methods HERE + } + + if filetype not in save_methods: + raise ValueError("save_methods specified is not in factory") + + return save_methods[filetype]( + obj=obj, location=location, save_config=save_config + ) + + def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: + """ + Load a parquet file + """ + + df = pd.read_parquet(location, **load_config) + + return df + + def _save_parquet(self, obj: object, location: str, save_config: dict): + """ + Save object as parquet + """ + + obj.to_parquet(location, **save_config) + + # def load_data_as_buffer(self, location: str) -> BytesIO: + # """ + # When the client is established, we can load data from a buffer + # """ + # with open(location, "rb") as file: + # # Read the entire file into a BytesIO object + # buffer = BytesIO(file.read()) + # buffer.seek(0) + + # return buffer + + # def upload_data_from_buffer(self, buffer: BytesIO, location: str) -> None: + # """ + # When the client is established, we can save out objects from a buffer + # """ + # if not Path(location).parent.exists(): + # os.makedirs(Path(location).parent) + + # # Write the contents of the buffer to the local file + # with open(location, "wb") as f: + # f.write(buffer.getvalue()) diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py index 3350714..bfd8687 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py +++ b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py @@ -4,7 +4,7 @@ Interface for all DataClient i.e. s3, database, local etc import pandas as pd from io import BytesIO -from typing import Protocol +from typing import Protocol, Union class DataClient(Protocol): @@ -12,44 +12,22 @@ class DataClient(Protocol): Declare the methods required for a DataClient """ - def ingest_configurations(self, config: dict) -> None: - """ - Load all configuration into the instance (self.config) - """ - ... - - def establish_client(self) -> None: + def _establish_client(self) -> None: """ With the given configurations, create the connection to the client (self.client) """ ... - def download_data(self, location: dict) -> None: + def load_data( + self, location: str, filetype: str, load_config: Union[dict, None] + ) -> pd.DataFrame: """ - When the client is established, we can load data + Generic to load data """ - ... - def load_data_as_buffer(self, location: str) -> BytesIO: + def save_data( + self, obj: object, location: str, filetype: str, save_config: Union[dict, None] + ) -> None: """ - When the client is established, we can load data + Generic to save data """ - ... - - def load_database(self, database_location: dict) -> None: - """ - When the client is established, we can read from a database - """ - ... - - def upload_data(self, location: str) -> None: - """ - When the client is established, we can save out objects - """ - ... - - def upload_data_from_buffer(self, buffer: BytesIO, location: str) -> None: - """ - When the client is established, we can save out objects - """ - ... diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index f04423a..33000e8 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -11,12 +11,12 @@ stages: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.1 + train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: febdc8362200167078dfa578cf2bc889.dir - size: 24296908 + md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir + size: 11982694 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3948 - path: data/prepared_data hash: md5 - md5: febdc8362200167078dfa578cf2bc889.dir - size: 24296908 + md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir + size: 11982694 nfiles: 2 params: configs/build_model.yaml: @@ -36,32 +36,33 @@ stages: output_filepath: ./data/model/autogluonmodel/ problem_type: regression eval_metric: mean_absolute_error - time_limit: 200 - presets: medium_quality + time_limit: 400 + presets: high_quality excluded_model_types: + - KNN SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear - model_save_filepath: ./data/model/autogluonmodel/ - model_type: AutogluonAutoML + model_save_filepath: ./data/model/model.joblib + model_type: SKLearnLinearRegression outs: - path: data/model/ hash: md5 - md5: 154f823d56a9892948a633789d9b08a5.dir - size: 680552724 - nfiles: 18 + md5: f53ceced818ffe9e3ae327492d5a049a.dir + size: 1832 + nfiles: 1 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 154f823d56a9892948a633789d9b08a5.dir - size: 680552724 - nfiles: 18 + md5: f53ceced818ffe9e3ae327492d5a049a.dir + size: 1832 + nfiles: 1 - path: data/prepared_data hash: md5 - md5: febdc8362200167078dfa578cf2bc889.dir - size: 24296908 + md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir + size: 11982694 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -77,21 +78,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: d8abefde18d78588158ef6acf282e2ed.dir - size: 2948553 + md5: e71d1d864228b3f3994217bfcdbcc5b7.dir + size: 643090 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: d8abefde18d78588158ef6acf282e2ed.dir - size: 2948553 + md5: e71d1d864228b3f3994217bfcdbcc5b7.dir + size: 643090 nfiles: 1 - path: data/prepared_data hash: md5 - md5: febdc8362200167078dfa578cf2bc889.dir - size: 24296908 + md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir + size: 11982694 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -107,8 +108,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f5aaae75ea74241500cd1ce76751c579 - size: 182 + md5: 915100dc1b46b4517a3e1d71d211849d + size: 179 startup_cleanup: cmd: python startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/src/prepare_data.py index 400adbf..7238513 100644 --- a/modules/ml-pipeline/src/pipeline/src/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/src/prepare_data.py @@ -6,17 +6,14 @@ Loading data from a client import os import yaml import pandas as pd -from typing import Optional, Tuple, Union +from typing import Tuple, Union from pathlib import Path -from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split from core.interface.InterfaceDataClient import DataClient -from core.interface.InterfaceDataHandler import DataHandler from core.interface.InterfaceFeatureProcessor import FeatureProcessor from configs.feature_processor_logic import business_logic, new_feature_funcs from core.Logger import logger from core.DataClient import dataclient_factory -from core.DataHandler import datahandler_factory from core.FeatureProcessor import feature_processor_factory RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -31,20 +28,9 @@ feature_process_path = Path(__file__).parent / "configs" / "feature_processor.ya feature_process_params = yaml.safe_load(open(feature_process_path)) -def use_dummy_data() -> pd.DataFrame: - diabetes_data = load_diabetes() - - x_data = pd.DataFrame(diabetes_data["data"], columns=diabetes_data["feature_names"]) # type: ignore - y_data = pd.DataFrame(diabetes_data["target"], columns=["target"]) # type: ignore - - data = pd.concat([x_data, y_data], axis=1) - return data - - def prepare_data( input_dataclient: DataClient, output_dataclient: DataClient, - datahandler: DataHandler, feature_processor: FeatureProcessor, data_filepath: str, train_proportion: float, @@ -64,7 +50,11 @@ def prepare_data( logger.info("--- Loading data ---") logger.info("--------------------") - data = datahandler.load_data(dataclient=input_dataclient, location=data_filepath) + data_filetype = Path(data_filepath).suffix + + data = input_dataclient.load_data( + location=data_filepath, filetype=data_filetype, load_config={} + ) logger.info("--------------------------") logger.info("--- Feature Processing ---") @@ -93,13 +83,15 @@ def prepare_data( logger.info("--- Outputting data ---") logger.info("-----------------------") - datahandler.save_data( - dataclient=output_dataclient, obj=train, location=output_train_filepath + output_train_filetype = Path(output_train_filepath).suffix + output_dataclient.save_data( + obj=train, location=output_train_filepath, filetype=output_train_filetype ) if test is not None: - datahandler.save_data( - dataclient=output_dataclient, obj=test, location=output_test_filepath + output_test_filetype = Path(output_test_filepath).suffix + output_dataclient.save_data( + obj=test, location=output_test_filepath, filetype=output_test_filetype ) return train, test @@ -118,22 +110,14 @@ if __name__ == "__main__": input_dataclient_type = prepare_data_params["input_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"] - input_dataclient = dataclient_factory(input_dataclient_type) - output_dataclient = dataclient_factory(output_dataclient_type) - - input_dataclient.ingest_configurations(config=client_params[input_dataclient_type]) - input_dataclient.establish_client() - - output_dataclient.ingest_configurations( - config=client_params[output_dataclient_type] + input_dataclient = dataclient_factory( + dataclient_type=input_dataclient_type, + dataclient_config=client_params[input_dataclient_type], + ) + output_dataclient = dataclient_factory( + dataclient_type=output_dataclient_type, + dataclient_config=client_params[output_dataclient_type], ) - output_dataclient.establish_client() - - logger.info("-----------------------------") - logger.info(f"--- Initiate DataHandler ---") - logger.info("-----------------------------") - - datahandler = datahandler_factory(prepare_data_params["datahandler_type"]) logger.info("----------------------------------") logger.info(f"--- Initiate FeatureProcessor ---") @@ -150,7 +134,6 @@ if __name__ == "__main__": prepare_data( input_dataclient=input_dataclient, output_dataclient=output_dataclient, - datahandler=datahandler, feature_processor=feature_processor, data_filepath=prepare_data_params["data_filepath"], train_proportion=prepare_data_params["train_proportion"], From 2d7af3ed69454c14310c16639320ac13858171ea Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 17 Sep 2023 11:22:09 +0000 Subject: [PATCH 2/3] refactored dataclient --- .../src/pipeline/src/build_model.py | 14 +-- .../src/pipeline/src/core/DataClient.py | 54 ++++++------ .../src/pipeline/src/core/DataHandler.py | 86 ------------------- .../src/core/interface/InterfaceDataClient.py | 6 +- .../core/interface/InterfaceDataHandler.py | 26 ------ modules/ml-pipeline/src/pipeline/src/dvc.lock | 48 +++++------ .../src/pipeline/src/generate_metrics.py | 39 +++------ .../src/pipeline/src/generate_predictions.py | 31 +++---- .../src/pipeline/src/prepare_data.py | 16 +--- 9 files changed, 85 insertions(+), 235 deletions(-) delete mode 100644 modules/ml-pipeline/src/pipeline/src/core/DataHandler.py delete mode 100644 modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py diff --git a/modules/ml-pipeline/src/pipeline/src/build_model.py b/modules/ml-pipeline/src/pipeline/src/build_model.py index dde3035..029ad99 100644 --- a/modules/ml-pipeline/src/pipeline/src/build_model.py +++ b/modules/ml-pipeline/src/pipeline/src/build_model.py @@ -12,7 +12,6 @@ from core.Logger import logger from core.interface.InterfaceModels import MLModel from core.interface.InterfaceDataClient import DataClient from core.DataClient import dataclient_factory -from core.DataHandler import datahandler_factory from core.MLModels import model_factory RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -46,14 +45,12 @@ def build_model( if train_data is None: if train_filepath is None: raise ValueError(f"Need {train_filepath} if no data supplied") - train_data = datahandler.load_data( - dataclient=dataclient, location=train_filepath - ) + train_data = dataclient.load_data(location=train_filepath) if test_data is None: if test_filepath is None: raise ValueError(f"Need {test_filepath} if no data supplied") - test_data = datahandler.load_data(dataclient=dataclient, location=test_filepath) + test_data = dataclient.load_data(location=test_filepath) logger.info("----------------------") logger.info("--- Training model ---") @@ -80,14 +77,9 @@ if __name__ == "__main__": logger.info(f"--- Initiate DataClient ---") logger.info("----------------------------") + # Output of previous prepare data step, will be where the data is dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) - logger.info("-----------------------------") - logger.info(f"--- Initiate DataHandler ---") - logger.info("-----------------------------") - - datahandler = datahandler_factory(prepare_data_params["datahandler_type"]) - logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") logger.info("-------------------------") diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py index 98ad821..c8c9f2c 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py @@ -3,6 +3,7 @@ Implementations of the DataClient Protocol """ import os +import json import boto3 import pandas as pd from pathlib import Path @@ -13,7 +14,7 @@ from core.Logger import logger def dataclient_factory( - dataclient_type: str, dataclient_config: Union[dict, None] + dataclient_type: str, dataclient_config: Union[dict, None] = None ) -> DataClient: """ Determine which dataclient to use @@ -77,7 +78,7 @@ class AWSS3Client: ) def load_data( - self, location: str, filetype: str, load_config: Union[dict, None] = None + self, location: str, load_config: Union[dict, None] = None ) -> pd.DataFrame: """ Generic to load data @@ -89,6 +90,8 @@ class AWSS3Client: if load_config is None: load_config = {} + filetype = Path(location).suffix + load_methods = { ".parquet": self._load_parquet, # "": _load_directory(**load_config), @@ -104,7 +107,6 @@ class AWSS3Client: self, obj: object, location: str, - filetype: str, save_config: Union[dict, None] = None, ) -> None: """ @@ -117,6 +119,8 @@ class AWSS3Client: if save_config is None: save_config = {} + filetype = Path(location).suffix + save_methods = { ".parquet": self._save_parquet, # "": _save_directory(**save_config), @@ -196,7 +200,7 @@ class LocalClient: logger.info("Local - No establishing client required") def load_data( - self, location: str, filetype: str, load_config: Union[dict, None] = None + self, location: str, load_config: Union[dict, None] = None ) -> pd.DataFrame: """ Generic to load data @@ -205,6 +209,8 @@ class LocalClient: if load_config is None: load_config = {} + filetype = Path(location).suffix + load_methods = { ".parquet": self._load_parquet, # "": _load_directory(**load_config), @@ -220,7 +226,6 @@ class LocalClient: self, obj: object, location: str, - filetype: str, save_config: Union[dict, None] = None, ) -> None: """ @@ -234,10 +239,13 @@ class LocalClient: save_methods = { ".parquet": self._save_parquet, + ".json": self._save_json # "": _save_directory(**save_config), # ADD MORE save_methods HERE } + filetype = Path(location).suffix + if filetype not in save_methods: raise ValueError("save_methods specified is not in factory") @@ -254,31 +262,29 @@ class LocalClient: return df - def _save_parquet(self, obj: object, location: str, save_config: dict): + def _save_parquet(self, obj: pd.DataFrame, location: str, save_config: dict): """ Save object as parquet """ obj.to_parquet(location, **save_config) - # def load_data_as_buffer(self, location: str) -> BytesIO: - # """ - # When the client is established, we can load data from a buffer - # """ - # with open(location, "rb") as file: - # # Read the entire file into a BytesIO object - # buffer = BytesIO(file.read()) - # buffer.seek(0) + def _save_json(self, obj: dict, location: str, save_config: dict): + """ + Save object as json + """ + # Serialize the dictionary to a JSON-formatted string + json_string = json.dumps(obj) # indent for pretty formatting - # return buffer + # Convert the JSON string to bytes (UTF-8 encoding) + json_bytes = json_string.encode("utf-8") - # def upload_data_from_buffer(self, buffer: BytesIO, location: str) -> None: - # """ - # When the client is established, we can save out objects from a buffer - # """ - # if not Path(location).parent.exists(): - # os.makedirs(Path(location).parent) + # Create a BytesIO object and write the JSON bytes to it + buffer = BytesIO() + buffer.write(json_bytes) - # # Write the contents of the buffer to the local file - # with open(location, "wb") as f: - # f.write(buffer.getvalue()) + buffer.seek(0) + + # Write the contents of the buffer to the local file + with open(location, "wb") as f: + f.write(buffer.getvalue()) diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py b/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py deleted file mode 100644 index f5c07c1..0000000 --- a/modules/ml-pipeline/src/pipeline/src/core/DataHandler.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Implementations of the datahandler Protocol -""" - -import json -import pandas as pd -from io import BytesIO -from typing import List -from core.interface.InterfaceDataHandler import DataHandler -from core.interface.InterfaceDataClient import DataClient - - -def datahandler_factory(datahandler_type: str) -> DataHandler: - """ - Determine which dataclient to use - """ - datahandler = { - "parquet": ParquetHandler(), - "json": JSONHandler() - # ADD MORE DATACLIENTS HERE - } - - if datahandler_type not in datahandler: - raise ValueError("Dataloader type specified is not in factory") - - return datahandler[datahandler_type] - - -def validate_dict_keys(keys_1: List[str], keys_2: List[str], config_type: str): - if not set(keys_1).issubset(keys_2): - raise ValueError(f"Incorrect {config_type} keys specified") - - -class ParquetHandler: - """ - Load and save Parquet datasets - """ - - def load_data(self, dataclient: DataClient, location: str) -> pd.DataFrame: - """ - When the client is established, we can load data - """ - df = pd.read_parquet(dataclient.load_data_as_buffer(location=location)) - return df - - def save_data( - self, dataclient: DataClient, obj: pd.DataFrame, location: str - ) -> None: - """ - When the client is established, we can save out objects - """ - # Convert the Pandas DataFrame to a Parquet buffer - parquet_buffer = BytesIO() - obj.to_parquet(parquet_buffer, index=False) - - dataclient.upload_data_from_buffer(buffer=parquet_buffer, location=location) - - -class JSONHandler: - """ - Load and save Parquet datasets - """ - - def load_data(self, dataclient: DataClient, location: str) -> pd.DataFrame: - """ - When the client is established, we can load data - """ - ... - - def save_data(self, dataclient: DataClient, obj: dict, location: str) -> None: - """ - When the client is established, we can save out objects - """ - # Serialize the dictionary to a JSON-formatted string - json_string = json.dumps(obj) # indent for pretty formatting - - # Convert the JSON string to bytes (UTF-8 encoding) - json_bytes = json_string.encode("utf-8") - - # Create a BytesIO object and write the JSON bytes to it - buffer = BytesIO() - buffer.write(json_bytes) - - buffer.seek(0) - - dataclient.upload_data_from_buffer(buffer=buffer, location=location) diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py index bfd8687..d572c2b 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py +++ b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py @@ -18,15 +18,13 @@ class DataClient(Protocol): """ ... - def load_data( - self, location: str, filetype: str, load_config: Union[dict, None] - ) -> pd.DataFrame: + def load_data(self, location: str, load_config: Union[dict, None]) -> pd.DataFrame: """ Generic to load data """ def save_data( - self, obj: object, location: str, filetype: str, save_config: Union[dict, None] + self, obj: object, location: str, save_config: Union[dict, None] ) -> None: """ Generic to save data diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py deleted file mode 100644 index 1c21144..0000000 --- a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataHandler.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Interface for all DataHandler i.e. Parquet data, csv data -""" - -import pandas as pd -from typing import Protocol, Union, Any -from core.interface.InterfaceDataClient import DataClient - - -class DataHandler(Protocol): - """ - Declare the methods required for a DataClient - """ - - def load_data(self, dataclient: DataClient, location: str) -> pd.DataFrame: - """ - When the client is established, we can load data - """ - ... - - def save_data( - self, dataclient: DataClient, obj: Union[pd.DataFrame, dict, Any], location: str - ) -> None: - """ - When the client is established, we can save out objects - """ diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index 33000e8..1f805aa 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: prepare_data.py hash: md5 - md5: 9c31bfb1b75ea3c9685ec459cbb50e62 - size: 5921 + md5: 2cfe9e3012280e0cecdb84da12c974d9 + size: 5009 params: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet @@ -15,20 +15,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir - size: 11982694 + md5: ea0a2baf3931e692d6344ba609331089.dir + size: 13232732 nfiles: 2 build_model: cmd: python build_model.py deps: - path: build_model.py hash: md5 - md5: 662cd6b1562fbbc2c7d30dd0f2375a66 - size: 3948 + md5: 46bcc34f20c6851cd987640889eefde6 + size: 3671 - path: data/prepared_data hash: md5 - md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir - size: 11982694 + md5: ea0a2baf3931e692d6344ba609331089.dir + size: 13232732 nfiles: 2 params: configs/build_model.yaml: @@ -48,7 +48,7 @@ stages: outs: - path: data/model/ hash: md5 - md5: f53ceced818ffe9e3ae327492d5a049a.dir + md5: eb2b910dec66481e75bb6058622f6e55.dir size: 1832 nfiles: 1 generate_predictions: @@ -56,18 +56,18 @@ stages: deps: - path: data/model hash: md5 - md5: f53ceced818ffe9e3ae327492d5a049a.dir + md5: eb2b910dec66481e75bb6058622f6e55.dir size: 1832 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir - size: 11982694 + md5: ea0a2baf3931e692d6344ba609331089.dir + size: 13232732 nfiles: 2 - path: generate_predictions.py hash: md5 - md5: 32c0ecd082e1f8fc4426338d6629979c - size: 4686 + md5: d412c8c9b48b59a29f569633280a6e7f + size: 4237 params: configs/generate_predictions.yaml: input_dataclient_type: local @@ -78,26 +78,26 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: e71d1d864228b3f3994217bfcdbcc5b7.dir - size: 643090 + md5: 85ec3fa0cb387a7775eccd23185f7966.dir + size: 643406 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: e71d1d864228b3f3994217bfcdbcc5b7.dir - size: 643090 + md5: 85ec3fa0cb387a7775eccd23185f7966.dir + size: 643406 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 5cbabd20ff23b9d6734c5c68684dc8dc.dir - size: 11982694 + md5: ea0a2baf3931e692d6344ba609331089.dir + size: 13232732 nfiles: 2 - path: generate_metrics.py hash: md5 - md5: 4709c42d93f8e717a3d9e4958e46cd76 - size: 4587 + md5: 5577a28107458dc1e6bcaaa098388095 + size: 4144 params: configs/generate_metrics.yaml: dataclient_type: local @@ -108,8 +108,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 915100dc1b46b4517a3e1d71d211849d - size: 179 + md5: d79f798a272e6b50597be4d08ae48fa8 + size: 180 startup_cleanup: cmd: python startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py index 3a5c668..7efeda9 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py @@ -5,17 +5,14 @@ After the model is built, we can evaluate its performance import os import yaml -import json import pandas as pd from pathlib import Path from core.interface.InterfaceModels import MLModel from core.interface.InterfaceMetrics import MLMetrics from core.interface.InterfaceDataClient import DataClient -from core.interface.InterfaceDataHandler import DataHandler from core.DataClient import dataclient_factory from core.MLModels import model_factory from core.MLMetrics import metrics_factory -from core.DataHandler import datahandler_factory from core.Logger import logger @@ -43,9 +40,8 @@ feature_process_params = yaml.safe_load(open(feature_process_path)) def generate_metrics( - dataclient: DataClient, - input_datahandler: DataHandler, - output_datahandler: DataHandler, + input_dataclient: DataClient, + output_dataclient: DataClient, model: MLModel, metrics: MLMetrics, target: str, @@ -62,17 +58,15 @@ def generate_metrics( logger.info("--- Loading test data ---") logger.info("-------------------------") - test_data = input_datahandler.load_data( - dataclient=dataclient, location=test_data_filepath + test_data = input_dataclient.load_data( + location=test_data_filepath, ) logger.info("---------------------------") logger.info("--- Loading predictions ---") logger.info("---------------------------") - predictions = input_datahandler.load_data( - dataclient=dataclient, location=predictions_output_filepath - ) + predictions = input_dataclient.load_data(location=predictions_output_filepath) logger.info("--------------------------") logger.info("--- Generating metrics ---") @@ -87,9 +81,7 @@ def generate_metrics( logger.info("--- Saving metrics ---") logger.info("----------------------") - output_datahandler.save_data( - dataclient=dataclient, obj=metrics_output, location=metrics_output_filepath - ) + output_dataclient.save_data(obj=metrics_output, location=metrics_output_filepath) if __name__ == "__main__": @@ -100,23 +92,18 @@ if __name__ == "__main__": model = model_factory(build_model_params["model_type"]) + # Use data client for input and output, as we use dvc to cache later to the cloud dataclient_type = generate_metrics_params["dataclient_type"] - dataclient = dataclient_factory(dataclient_type) - dataclient.ingest_configurations(client_params[dataclient_type]) - dataclient.establish_client() + dataclient = dataclient_factory( + dataclient_type=dataclient_type, + dataclient_config=client_params[dataclient_type], + ) - input_datahandler = datahandler_factory( - generate_metrics_params["input_datahandler_type"] - ) - output_datahandler = datahandler_factory( - generate_metrics_params["output_datahandler_type"] - ) metrics = metrics_factory(generate_metrics_params["metrics_type"]) generate_metrics( - dataclient=dataclient, - input_datahandler=input_datahandler, - output_datahandler=output_datahandler, + input_dataclient=dataclient, + output_dataclient=dataclient, model=model, metrics=metrics, target=feature_process_params["feature_processor_config"]["target"], diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py index 48e192b..f80ec18 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py @@ -5,15 +5,12 @@ After the model is built, we can evaluate its performance import os import yaml -import json import pandas as pd from pathlib import Path from core.interface.InterfaceModels import MLModel from core.interface.InterfaceDataClient import DataClient -from core.interface.InterfaceDataHandler import DataHandler from core.DataClient import dataclient_factory from core.MLModels import model_factory -from core.DataHandler import datahandler_factory from core.Logger import logger @@ -40,7 +37,6 @@ feature_process_params = yaml.safe_load(open(feature_process_path)) def generate_predictions( input_dataclient: DataClient, output_dataclient: DataClient, - datahandler: DataHandler, model: MLModel, target: str, model_filepath: str, @@ -56,9 +52,7 @@ def generate_predictions( logger.info("--- Loading test data ---") logger.info("-------------------------") - test_data = datahandler.load_data( - dataclient=input_dataclient, location=test_data_filepath - ) + test_data = input_dataclient.load_data(location=test_data_filepath) logger.info("---------------------") logger.info("--- Loading model ---") @@ -83,10 +77,8 @@ def generate_predictions( predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] - datahandler.save_data( - dataclient=output_dataclient, - obj=predictions_df, - location=predictions_output_filepath, + output_dataclient.save_data( + obj=predictions_df, location=predictions_output_filepath ) @@ -102,23 +94,20 @@ if __name__ == "__main__": # For predictions, we will want a cloud data client input_dataclient_type = generate_predictions_params["input_dataclient_type"] - input_dataclient = dataclient_factory(input_dataclient_type) - input_dataclient.ingest_configurations(config=client_params[input_dataclient_type]) - input_dataclient.establish_client() + input_dataclient = dataclient_factory( + dataclient_type=input_dataclient_type, + dataclient_config=client_params[input_dataclient_type], + ) output_dataclient_type = generate_predictions_params["output_dataclient_type"] - output_dataclient = dataclient_factory(output_dataclient_type) - output_dataclient.ingest_configurations( - config=client_params[output_dataclient_type] + output_dataclient = dataclient_factory( + dataclient_type=output_dataclient_type, + dataclient_config=client_params[output_dataclient_type], ) - output_dataclient.establish_client() - - datahandler = datahandler_factory(prepare_data_params["datahandler_type"]) generate_predictions( input_dataclient=input_dataclient, output_dataclient=output_dataclient, - datahandler=datahandler, model=model, target=feature_process_params["feature_processor_config"]["target"], model_filepath=build_model_params["model_save_filepath"], diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/src/prepare_data.py index 7238513..851be48 100644 --- a/modules/ml-pipeline/src/pipeline/src/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/src/prepare_data.py @@ -50,11 +50,7 @@ def prepare_data( logger.info("--- Loading data ---") logger.info("--------------------") - data_filetype = Path(data_filepath).suffix - - data = input_dataclient.load_data( - location=data_filepath, filetype=data_filetype, load_config={} - ) + data = input_dataclient.load_data(location=data_filepath, load_config={}) logger.info("--------------------------") logger.info("--- Feature Processing ---") @@ -83,16 +79,10 @@ def prepare_data( logger.info("--- Outputting data ---") logger.info("-----------------------") - output_train_filetype = Path(output_train_filepath).suffix - output_dataclient.save_data( - obj=train, location=output_train_filepath, filetype=output_train_filetype - ) + output_dataclient.save_data(obj=train, location=output_train_filepath) if test is not None: - output_test_filetype = Path(output_test_filepath).suffix - output_dataclient.save_data( - obj=test, location=output_test_filepath, filetype=output_test_filetype - ) + output_dataclient.save_data(obj=test, location=output_test_filepath) return train, test From b7e1cc441b5a5ba5b294cb237b5f6d95f09e6240 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 17 Sep 2023 11:30:47 +0000 Subject: [PATCH 3/3] try workflow --- .github/workflows/MLPipelinePostMerge.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index b8ab439..2cd4fe7 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -157,7 +157,13 @@ jobs: dvc push -r dev Register-New-Model-Dev: - if: github.event.pull_request.merged == true + needs: [Register-Major-Model-Dev, Register-Minor-Model-Dev, Register-Patch-Model-Dev] + if: | + always() && + (needs.Register-Major-Model-Dev.result == 'success' || needs.Register-Major-Model-Dev.result == 'skipped') && + (needs.Register-Minor-Model-Dev.result == 'success' || needs.Register-Minor-Model-Dev.result == 'skipped') && + (needs.Register-Patch-Model-Dev.result == 'success' || needs.Register-Patch-Model-Dev.result == 'skipped') + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -182,9 +188,10 @@ jobs: git config user.email "Github-Bot@no-reply.com" latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref) - + new_tag=${latest_version}#dev + git pull #Get new model registry md file changes git tag -a ${new_tag} -m "Registering Latest Version to Dev" git push origin ${new_tag} @@ -195,8 +202,7 @@ jobs: Register-Prediction-Image-Dev: - needs: Promote-Artefacts-To-Dev - # needs: [Promote-Artefacts-To-Dev, Register-New-Model-Dev] WILL ADD BACK ONCE REGISTER WORKS + needs: [Promote-Artefacts-To-Dev, Register-New-Model-Dev] runs-on: ubuntu-latest steps: