diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml index 0a059d6..a1307c1 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml @@ -1,7 +1,15 @@ -model_type: SKLearnLinearRegression -model_save_filepath: ./data/model/model.joblib +model_type: AutogluonAutoML +model_save_filepath: ./data/model/autogluonmodel/ SKLearnLinearRegression: null SKLearnSVMRegression: kernel: "linear" + +AutogluonAutoML: + output_filepath: ./data/model/autogluonmodel/ + problem_type: regression + eval_metric: mean_absolute_error + time_limit: 200 + presets: medium_quality + excluded_model_types: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml index 30dacbe..233a329 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml @@ -4,4 +4,5 @@ feature_processor_config: subsample_seed: 0 target: RDSAP_CHANGE drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"] - retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] + # retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] + retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml b/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml new file mode 100644 index 0000000..909fb4b --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml @@ -0,0 +1,2 @@ +artefacts: ./data +metrics: ./metrics diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLModels.py b/modules/ml-pipeline/src/pipeline/src/core/MLModels.py index 984c340..2c237ba 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/src/core/MLModels.py @@ -13,7 +13,9 @@ from pathlib import Path 
class AutogluonAutoML:
    """
    Wrapper exposing an AutoGluon ``TabularPredictor`` through the pipeline's
    load / save / train / predict model methods.

    The wrapped predictor is stored on ``self.model``; it is ``None`` until
    either ``load_model`` or ``train_model`` has been called.
    """

    # Only these keys may appear in the ``model_hyperparameters`` dict passed
    # to ``train_model``. The (misspelt) attribute name is kept unchanged
    # because code outside this class may reference it.
    ACCEPTED_MODEL_HYPERPAREMETERS = [
        "output_filepath",
        "problem_type",
        "eval_metric",
        "time_limit",
        "presets",
        "excluded_model_types",
    ]

    # Class-level default so the ``self.model is None`` guards below work on a
    # fresh instance instead of failing with AttributeError.
    model = None

    def load_model(self, path: Union[Path, str]) -> None:
        """
        Method to load a model

        Args:
            path: Location handed to ``TabularPredictor.load`` (converted to
                ``str`` first).
        """
        self.model = TabularPredictor.load(path=str(path))

    def save_model(self, path: Path) -> str:
        """
        Method to save a model

        Nothing is written here: the method only checks that a model is
        present, logs two informational messages and echoes the path back.

        Args:
            path: Location of the model.

        Returns:
            ``path`` converted to ``str``.

        Raises:
            KeyError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise KeyError("No model trained/ loaded - unable to save")

        logger.info("In local development mode - no need for s3 client")
        logger.info("Using AutoGluon Model - Model saving already occured")

        return str(path)

    def train_model(
        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
    ) -> None:
        """
        Method to train a model

        Args:
            data: Training data; wrapped in a ``TabularDataset`` before fitting.
            target: Name of the label column passed to ``TabularPredictor``.
            model_hyperparameters: Settings whose keys are checked against
                ``ACCEPTED_MODEL_HYPERPAREMETERS``. ``output_filepath``,
                ``problem_type`` and ``eval_metric`` configure the predictor;
                ``time_limit``, ``presets`` and ``excluded_model_types`` are
                forwarded to ``fit``.

        Raises:
            SystemExit: With exit code 1 when ``output_filepath`` is ``None``.
        """
        validate_dict_keys(
            keys_1=list(model_hyperparameters.keys()),
            keys_2=self.ACCEPTED_MODEL_HYPERPAREMETERS,
            config_type="Model Hyperparameters",
        )

        if model_hyperparameters["output_filepath"] is None:
            logger.error("Please specify a output_filepath in order to train a model")
            # Same exception the ``exit`` helper raises, without depending on
            # the interactive helpers injected by the ``site`` module.
            raise SystemExit(1)

        training_data = TabularDataset(data=data)

        predictor = TabularPredictor(
            label=target,
            path=model_hyperparameters["output_filepath"],
            problem_type=model_hyperparameters["problem_type"],
            eval_metric=model_hyperparameters["eval_metric"],
        )

        # The result of ``fit`` is what gets stored, exactly as before.
        self.model = predictor.fit(
            training_data,
            time_limit=model_hyperparameters["time_limit"],
            presets=model_hyperparameters["presets"],
            excluded_model_types=model_hyperparameters["excluded_model_types"],
        )

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
        Method to predict

        Args:
            data: Rows to generate predictions for.

        Returns:
            The predictor's output wrapped in a ``pd.Series``.

        Raises:
            SystemExit: With exit code 1 when no model is loaded or trained.
        """
        if self.model is None:
            # Reported through the logger, consistent with ``train_model``.
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        return pd.Series(self.model.predict(data))
path: generate_predictions.py hash: md5 - md5: 13e920c0bae8ac51dd907631578f7045 - size: 4126 + md5: 76c45e7575ec979e6c4c8e2cf754a720 + size: 4225 params: configs/generate_predictions.yaml: input_dataclient_type: local @@ -70,16 +77,16 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir - size: 945933 + md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir + size: 672577 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir - size: 945933 + md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir + size: 672577 nfiles: 1 - path: data/prepared_data hash: md5 @@ -88,8 +95,8 @@ stages: nfiles: 2 - path: generate_metrics.py hash: md5 - md5: 6276995b5e860d0f0bb4545aa5f5d347 - size: 4259 + md5: cc368845f62523575a9ed5c791e27815 + size: 4329 params: configs/generate_metrics.yaml: dataclient_type: local @@ -100,5 +107,16 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 995ccf3c6c3f6a975d22aa9bc9f4964e - size: 181 + md5: 3f03e50a419af6730351a5016e2ae98a + size: 182 + startup_cleanup: + cmd: python startup_cleanup.py + deps: + - path: startup_cleanup.py + hash: md5 + md5: f7fe2ca33004b34530da0a3ab48c1790 + size: 1458 + params: + configs/startup_cleanup.yaml: + artefacts: ./data + metrics: ./metrics diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/src/dvc.yaml index d1febb1..7e98535 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/src/dvc.yaml @@ -1,4 +1,12 @@ stages: + startup_cleanup: + cmd: python startup_cleanup.py + deps: + - startup_cleanup.py + params: + - configs/startup_cleanup.yaml: + - artefacts + - metrics prepare_data: cmd: python prepare_data.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py index 4ab1503..552db47 100644 --- 
"""
We remove all previous artefacts in the data folder, and the previous metrics
folder, for a dvc run
"""

import shutil
import yaml
from pathlib import Path
from core.Logger import logger

startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"

# Read the config inside a context manager so the file handle is closed as
# soon as parsing finishes instead of being left to the garbage collector.
with open(startup_cleanup_path, encoding="utf-8") as startup_cleanup_file:
    startup_cleanup_params = yaml.safe_load(startup_cleanup_file)


def _remove_directory(directory: str) -> None:
    """
    Recursively delete ``directory`` if it exists; do nothing otherwise.
    """
    directory_path = Path(directory)

    if directory_path.exists():
        logger.info(f"Removing the directory: {directory}")
        shutil.rmtree(directory_path)


def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
    """
    Remove the directories where artefacts and metrics are stored

    Args:
        artefacts_directory: Path of the artefacts directory to delete.
        metrics_directory: Path of the metrics directory to delete.

    A directory that does not exist is skipped silently. The artefacts
    directory is handled first, then the metrics directory.
    """
    _remove_directory(artefacts_directory)
    _remove_directory(metrics_directory)


if __name__ == "__main__":

    logger.info("----------------------------")
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("----------------------------")

    logger.info("---------------------")
    logger.info("--- Run Clean up ---")
    logger.info("---------------------")

    run_cleanup(
        artefacts_directory=startup_cleanup_params["artefacts"],
        metrics_directory=startup_cleanup_params["metrics"],
    )

    logger.info("-------------------------------")
    logger.info(f"--- {__file__} - Complete! ---")
    logger.info("-------------------------------")