added autogluon model

2026-06-08 11:17:25 +00:00 · 2023-09-12 14:24:45 +01:00 · 2023-09-12 14:24:45 +01:00 · 72334aeb44
commit 72334aeb44
parent b3c9bc8fd7
9 changed files with 192 additions and 25 deletions
--- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml
@ -1,7 +1,15 @@
-model_type: SKLearnLinearRegression
-model_save_filepath: ./data/model/model.joblib
+model_type: AutogluonAutoML
+model_save_filepath: ./data/model/autogluonmodel/

 SKLearnLinearRegression: null

 SKLearnSVMRegression:
  kernel: "linear"
+
+AutogluonAutoML:
+  output_filepath: ./data/model/autogluonmodel/
+  problem_type: regression
+  eval_metric: mean_absolute_error
+  time_limit: 200
+  presets: medium_quality
+  excluded_model_types: null
--- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml
@ -4,4 +4,5 @@ feature_processor_config:
  subsample_seed: 0
  target: RDSAP_CHANGE
  drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"]
-  retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
+  # retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
+  retain_features: null
--- a/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml
@ -0,0 +1,2 @@
+artefacts: ./data
+metrics: ./metrics
--- a/modules/ml-pipeline/src/pipeline/src/core/MLModels.py
+++ b/modules/ml-pipeline/src/pipeline/src/core/MLModels.py
@ -13,7 +13,9 @@ from pathlib import Path
 from typing import Union, List
 from sklearn import linear_model
 from sklearn.svm import SVR
+from autogluon.tabular import TabularDataset, TabularPredictor
 from core.interface.InterfaceModels import MLModel
+from core.Logger import logger


 def model_factory(model_type: str) -> MLModel:
@ -23,6 +25,7 @@ def model_factory(model_type: str) -> MLModel:
    models = {
        "SKLearnLinearRegression": SKLearnLinearRegression(),
        "SKLearnSVMRegression": SKLearnSVMRegression(),
+        "AutogluonAutoML": AutogluonAutoML()
        # ADD OTHER MODELS HERE
    }

@ -131,3 +134,78 @@ class SKLearnSVMRegression:
        """
        self.predictions = pd.Series(self.model.predict(data))
        return self.predictions
+
+
+class AutogluonAutoML:
+    """
+    Model wrapper around AutoGluon's TabularPredictor.
+
+    Offers load_model / save_model / train_model / predict and is the class
+    registered in model_factory under the key "AutogluonAutoML".
+    """
+
+    # Keys that train_model accepts in its model_hyperparameters dictionary
+    ACCEPTED_MODEL_HYPERPAREMETERS = [
+        "output_filepath",
+        "problem_type",
+        "eval_metric",
+        "time_limit",
+        "presets",
+        "excluded_model_types",
+    ]
+
+    def __init__(self) -> None:
+        # Start without a predictor so that the "is None" guards in
+        # save_model and predict work before anything is trained or loaded
+        # (without this attribute they would raise AttributeError instead).
+        self.model = None
+
+    def load_model(self, path: Union[Path, str]) -> None:
+        """
+        Method to load a model
+
+        The predictor is read from the directory at `path` with
+        TabularPredictor.load and stored on self.model.
+        """
+        self.model = TabularPredictor.load(path=str(path))
+
+    def save_model(self, path: Union[Path, str]) -> str:
+        """
+        Method to save a model
+
+        Nothing is written here: the method only logs that saving has
+        already occurred and returns `path` as a string.
+
+        Raises:
+            KeyError: if no model has been trained or loaded
+        """
+        if self.model is None:
+            raise KeyError("No model trained/ loaded - unable to save")
+
+        logger.info("In local development mode - no need for s3 client")
+        logger.info("Using AutoGluon Model - Model saving already occured")
+
+        return str(path)
+
+    def train_model(
+        self, data: pd.DataFrame, target: str, model_hyperparameters: dict
+    ) -> None:
+        """
+        Method to train a model
+
+        `data` is the training frame, `target` is the label column name and
+        `model_hyperparameters` must use the keys listed in
+        ACCEPTED_MODEL_HYPERPAREMETERS. The fitted predictor is stored on
+        self.model. The process exits with status 1 when "output_filepath"
+        is None.
+        """
+
+        validate_dict_keys(
+            keys_1=list(model_hyperparameters.keys()),
+            keys_2=self.ACCEPTED_MODEL_HYPERPAREMETERS,
+            config_type="Model Hyperparameters",
+        )
+
+        if model_hyperparameters["output_filepath"] is None:
+            logger.error("Please specify a output_filepath in order to train a model")
+            # Same effect as exit(1), without relying on the helper that the
+            # site module injects for interactive sessions.
+            raise SystemExit(1)
+
+        training_data = TabularDataset(data=data)
+
+        self.model = TabularPredictor(
+            label=target,
+            path=model_hyperparameters["output_filepath"],
+            problem_type=model_hyperparameters["problem_type"],
+            eval_metric=model_hyperparameters["eval_metric"],
+        ).fit(
+            training_data,
+            time_limit=model_hyperparameters["time_limit"],
+            presets=model_hyperparameters["presets"],
+            excluded_model_types=model_hyperparameters["excluded_model_types"],
+        )
+
+    def predict(self, data: pd.DataFrame) -> pd.Series:
+        """
+        Method to predict
+
+        Returns the predictions for `data` wrapped in a pandas Series. The
+        process exits with status 1 when no model has been trained or
+        loaded.
+        """
+
+        if self.model is None:
+            # Reported through the logger, like the failure in train_model
+            logger.error("No model loaded/ trained")
+            raise SystemExit(1)
+
+        return pd.Series(self.model.predict(data))
--- a/modules/ml-pipeline/src/pipeline/src/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock
@ -23,8 +23,8 @@ stages:
    deps:
    - path: build_model.py
      hash: md5
-      md5: 58315ea127dcc127e2c22ab1205fddb2
-      size: 3925
+      md5: 662cd6b1562fbbc2c7d30dd0f2375a66
+      size: 3948
    - path: data/prepared_data
      hash: md5
      md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
@ -32,25 +32,32 @@ stages:
      nfiles: 2
    params:
      configs/build_model.yaml:
+        AutogluonAutoML:
+          output_filepath: ./data/model/autogluonmodel/
+          problem_type: regression
+          eval_metric: mean_absolute_error
+          time_limit: 200
+          presets: medium_quality
+          excluded_model_types:
        SKLearnLinearRegression:
        SKLearnSVMRegression:
          kernel: linear
-        model_save_filepath: ./data/model/model.joblib
-        model_type: SKLearnLinearRegression
+        model_save_filepath: ./data/model/autogluonmodel/
+        model_type: AutogluonAutoML
    outs:
    - path: data/model/
      hash: md5
-      md5: 40fa511f4f401f9d2c7da814afe198ef.dir
-      size: 920
-      nfiles: 1
+      md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
+      size: 1264795580
+      nfiles: 28
  generate_predictions:
    cmd: python generate_predictions.py
    deps:
    - path: data/model
      hash: md5
-      md5: 40fa511f4f401f9d2c7da814afe198ef.dir
-      size: 920
-      nfiles: 1
+      md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
+      size: 1264795580
+      nfiles: 28
    - path: data/prepared_data
      hash: md5
      md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
@ -58,8 +65,8 @@ stages:
      nfiles: 2
    - path: generate_predictions.py
      hash: md5
-      md5: 13e920c0bae8ac51dd907631578f7045
-      size: 4126
+      md5: 76c45e7575ec979e6c4c8e2cf754a720
+      size: 4225
    params:
      configs/generate_predictions.yaml:
        input_dataclient_type: local
@ -70,16 +77,16 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
-      size: 945933
+      md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
+      size: 672577
      nfiles: 1
  generate_metrics:
    cmd: python generate_metrics.py
    deps:
    - path: data/predictions
      hash: md5
-      md5: 01e8d3483e1f90b5d92022ee4a65bbd7.dir
-      size: 945933
+      md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
+      size: 672577
      nfiles: 1
    - path: data/prepared_data
      hash: md5
@ -88,8 +95,8 @@ stages:
      nfiles: 2
    - path: generate_metrics.py
      hash: md5
-      md5: 6276995b5e860d0f0bb4545aa5f5d347
-      size: 4259
+      md5: cc368845f62523575a9ed5c791e27815
+      size: 4329
    params:
      configs/generate_metrics.yaml:
        dataclient_type: local
@ -100,5 +107,16 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 995ccf3c6c3f6a975d22aa9bc9f4964e
-      size: 181
+      md5: 3f03e50a419af6730351a5016e2ae98a
+      size: 182
+  startup_cleanup:
+    cmd: python startup_cleanup.py
+    deps:
+    - path: startup_cleanup.py
+      hash: md5
+      md5: f7fe2ca33004b34530da0a3ab48c1790
+      size: 1458
+    params:
+      configs/startup_cleanup.yaml:
+        artefacts: ./data
+        metrics: ./metrics
--- a/modules/ml-pipeline/src/pipeline/src/dvc.yaml
+++ b/modules/ml-pipeline/src/pipeline/src/dvc.yaml
@ -1,4 +1,12 @@
 stages:
+  startup_cleanup:
+    cmd: python startup_cleanup.py
+    deps:
+    - startup_cleanup.py
+    params:
+    - configs/startup_cleanup.yaml:
+      - artefacts
+      - metrics
  prepare_data:
    cmd: python prepare_data.py
    deps:
--- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py
@ -77,7 +77,9 @@ def generate_predictions(
    logger.info("--- Saving predictions ---")
    logger.info("--------------------------")

-    predictions_df = pd.DataFrame(predictions, columns=[predictions_column_name])
+    predictions_df = pd.DataFrame(predictions)
+    predictions_df.columns = [predictions_column_name]
+
    datahandler.save_data(
        dataclient=output_dataclient,
        obj=predictions_df,
--- a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt
@ -1,7 +1,7 @@
 joblib==1.3.2
 boto3==1.28.17
 pandas==1.5.3
-scikit-learn==1.3.0
+autogluon==0.8.2
 pyarrow==13.0.0
 pre-commit==3.3.3
 sphinx==7.2.5
--- a/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py
+++ b/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py
@ -0,0 +1,50 @@
+"""
+We remove all previous artefacts in the data folder for a dvc run
+"""
+
+import shutil
+import yaml
+from pathlib import Path
+from core.Logger import logger
+
+startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"
+# read_text opens and closes the config file itself, so no file handle is
+# left open after the parameters have been parsed.
+startup_cleanup_params = yaml.safe_load(startup_cleanup_path.read_text())
+
+
+def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
+    """
+    Delete the artefacts directory and then the metrics directory.
+
+    A directory that does not exist is skipped; each removal is logged
+    before it happens.
+    """
+    # Artefacts are handled first, metrics second.
+    for directory in (artefacts_directory, metrics_directory):
+        directory_path = Path(directory)
+
+        if not directory_path.exists():
+            continue
+
+        logger.info(f"Removing the directory: {directory}")
+        shutil.rmtree(directory_path)
+
+
+if __name__ == "__main__":
+
+    # Opening banner, followed by the banner announcing the clean-up step
+    for banner_line in (
+        "----------------------------",
+        f"--- {__file__} - Start! ---",
+        "----------------------------",
+        "---------------------",
+        "--- Run Clean up ---",
+        "---------------------",
+    ):
+        logger.info(banner_line)
+
+    # Directories to remove come from configs/startup_cleanup.yaml
+    run_cleanup(
+        artefacts_directory=startup_cleanup_params["artefacts"],
+        metrics_directory=startup_cleanup_params["metrics"],
+    )
+
+    # Closing banner
+    for banner_line in (
+        "-------------------------------",
+        f"--- {__file__} - Complete! ---",
+        "-------------------------------",
+    ):
+        logger.info(banner_line)