use dynaconf to simplify configs

2026-07-27 22:45:04 +00:00 · 2023-09-29 11:37:36 +00:00 · 2023-09-29 11:37:36 +00:00 · ba592b36b7
commit ba592b36b7
parent 44d0e145f6
25 changed files with 255 additions and 304 deletions
--- a/modules/ml-pipeline/src/pipeline/.gitignore
+++ b/modules/ml-pipeline/src/pipeline/.gitignore
@ -0,0 +1,3 @@
+
+# Ignore dynaconf secret files
+.secrets.*
--- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py
+++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py
@ -6,9 +6,9 @@ import shutil
 import yaml
 from pathlib import Path
 from core.Logger import logger
+from config import settings

-startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"
-startup_cleanup_params = yaml.safe_load(open(startup_cleanup_path))
+startup_cleanup_params = settings.startup_cleanup


 def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
--- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py
+++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py
@ -15,6 +15,7 @@ from configs.feature_processor_logic import business_logic, new_feature_funcs
 from core.Logger import logger
 from core.DataClient import dataclient_factory
 from core.FeatureProcessor import feature_processor_factory
+from config import settings

 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
@ -22,14 +23,9 @@ logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

-client_path = Path(__file__).parent / "configs" / "client.yaml"
-client_params = yaml.safe_load(open(client_path))
-
-prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
-prepare_data_params = yaml.safe_load(open(prepare_data_path))
-
-feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
-feature_process_params = yaml.safe_load(open(feature_process_path))
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+feature_process_params = settings.feature_processor

 data_filepath = prepare_data_params["data_filepath"]
 train_proportion = prepare_data_params["train_proportion"]
--- a/modules/ml-pipeline/src/pipeline/2_build_model.py
+++ b/modules/ml-pipeline/src/pipeline/2_build_model.py
@ -16,6 +16,7 @@ from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.MLMetrics import metrics_factory
 from configs.post_prediction_logic import post_prediction_logic
+from config import settings

 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
@ -23,17 +24,10 @@ logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

-prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
-prepare_data_params = yaml.safe_load(open(prepare_data_path))
-
-build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
-build_model_params = yaml.safe_load(open(build_model_path))
-
-feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
-feature_process_params = yaml.safe_load(open(feature_process_path))
-
-generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
-generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
+prepare_data_params = settings.prepare_data
+build_model_params = settings.build_model
+feature_process_params = settings.feature_processor
+generate_metrics_params = settings.generate_metrics

 model_type = build_model_params["model_type"]
 target = feature_process_params["feature_processor_config"]["target"]
@ -149,8 +143,8 @@ if __name__ == "__main__":
        metrics=metrics,
        target=target,
        model_save_location=model_save_location,
-        model_hyperparameters=build_model_params[model_type],
-        train_filepath=model_hyperparameters,
+        model_hyperparameters=model_hyperparameters,
+        train_filepath=train_filepath,
        test_filepath=test_filepath,
        fit_metrics_filepath=fit_metrics_filepath,
    )
--- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
@ -13,7 +13,7 @@ from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.Logger import logger
 from configs.post_prediction_logic import post_prediction_logic
-
+from config import settings

 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
@ -21,26 +21,20 @@ logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

-client_path = Path(__file__).parent / "configs" / "client.yaml"
-client_params = yaml.safe_load(open(client_path))
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+build_model_params = settings.build_model
+generate_predictions_params = settings.generate_predictions
+feature_process_params = settings.feature_processor

-prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
-prepare_data_params = yaml.safe_load(open(prepare_data_path))
+input_dataclient_type = generate_predictions_params["input_dataclient_type"]
+output_dataclient_type = generate_predictions_params["output_dataclient_type"]

-build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
-build_model_params = yaml.safe_load(open(build_model_path))
-
-generate_predictions_path = (
-    Path(__file__).parent / "configs" / "generate_predictions.yaml"
-)
-generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
-
-feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
-feature_process_params = yaml.safe_load(open(feature_process_path))
+test_data_filepath = generate_predictions_params["test_data_filepath"]
+test_data_filepath = os.environ.get("PREDICTION_FILE", test_data_filepath)

 target = feature_process_params["feature_processor_config"]["target"]
 model_filepath = build_model_params["model_save_filepath"]
-test_data_filepath = generate_predictions_params["test_data_filepath"]
 predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
 predictions_column_name = generate_predictions_params["predictions_column_name"]

@ -57,13 +51,11 @@ logger.info("----------------------------")
 # We may have different locations of loading hence why we use one specified in generate_predictions.yaml
 # I.e. for metric runs, this will be a local data client
 # For predictions, we will want a cloud data client
-input_dataclient_type = generate_predictions_params["input_dataclient_type"]
 input_dataclient = dataclient_factory(
    dataclient_type=input_dataclient_type,
    dataclient_config=client_params[input_dataclient_type],
 )

-output_dataclient_type = generate_predictions_params["output_dataclient_type"]
 output_dataclient = dataclient_factory(
    dataclient_type=output_dataclient_type,
    dataclient_config=client_params[output_dataclient_type],
--- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
+++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
@ -14,7 +14,7 @@ from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.MLMetrics import metrics_factory
 from core.Logger import logger
-
+from config import settings

 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
@ -22,25 +22,12 @@ logger.info("----------------------------")

 RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

-client_path = Path(__file__).parent / "configs" / "client.yaml"
-client_params = yaml.safe_load(open(client_path))
-
-prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
-prepare_data_params = yaml.safe_load(open(prepare_data_path))
-
-build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
-build_model_params = yaml.safe_load(open(build_model_path))
-
-generate_predictions_path = (
-    Path(__file__).parent / "configs" / "generate_predictions.yaml"
-)
-generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
-
-generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
-generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
-
-feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
-feature_process_params = yaml.safe_load(open(feature_process_path))
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+build_model_params = settings.build_model
+generate_predictions_params = settings.generate_predictions
+generate_metrics_params = settings.generate_metrics
+feature_process_params = settings.feature_processor

 target = feature_process_params["feature_processor_config"]["target"]
 test_data_filepath = generate_predictions_params["test_data_filepath"]
--- a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
+++ b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
--- a/modules/ml-pipeline/src/pipeline/config.py
+++ b/modules/ml-pipeline/src/pipeline/config.py
@ -0,0 +1,14 @@
+from dynaconf import Dynaconf
+
+settings = Dynaconf(
+    environments=True,
+    envvar_prefix="DYNACONF",
+    settings_files=[
+        "./configs/settings.yaml",
+        "./configs/build_model.yaml",
+        "./configs/analysis.yaml",
+    ],
+)
+
+# `envvar_prefix` = export envvars with `export DYNACONF_FOO=bar`.
+# `settings_files` = Load these files in the order listed.
--- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml
@ -0,0 +1,16 @@
+default:
+  model_analysis:
+    dataclient_type: local
+    feature_importance_filepath: ./analysis/feature_importance.parquet
+    permutation_subsample_amount: 1000
+    loss_fns: "mean_absolute_percentage_error"
+    feature_importance_column: importance
+    n_repeats: 5
+    figwidth: 7
+    figheight: 6
+
+  prediction_analysis:
+    dataclient_type: local
+    nshap_samples: 100 # how many samples to use to approximate each Shapley value; larger values will be slower
+    n_val: 30  # how many datapoints from validation data should we interpret predictions for, larger values will be slower
+    row_index: [0, 10, 20] # index of an example datapoint
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -1,16 +1,18 @@
-model_type: AutogluonAutoML
-model_save_filepath: ./data/model/autogluonmodel/
-fit_metrics_filepath: ./metrics/fit_metrics.json
+default:
+  build_model:
+    model_type: AutogluonAutoML
+    model_save_filepath: ./data/model/autogluonmodel/
+    fit_metrics_filepath: ./metrics/fit_metrics.json

-SKLearnLinearRegression: null
+    SKLearnLinearRegression: null

-SKLearnSVMRegression:
-  kernel: "linear"
+    SKLearnSVMRegression:
+      kernel: "linear"

-AutogluonAutoML:
-  output_filepath: ./data/model/autogluonmodel/
-  problem_type: regression
-  eval_metric: mean_absolute_error
-  time_limit: 800
-  presets: medium_quality
-  excluded_model_types: ['KNN']
+    AutogluonAutoML:
+      output_filepath: ./data/model/autogluonmodel/
+      problem_type: regression
+      eval_metric: mean_absolute_error
+      time_limit: 75
+      presets: medium_quality
+      excluded_model_types: ['KNN']
--- a/modules/ml-pipeline/src/pipeline/configs/client.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/client.yaml
@ -1,10 +0,0 @@
-aws-s3:
-  AWS_ACCESS_KEY_ID: null
-  AWS_SECRET_ACCESS_KEY: null
-  ENDPOINT_URL: null
-aws-s3-mock:
-  AWS_ACCESS_KEY_ID: minio
-  AWS_SECRET_ACCESS_KEY: minio123
-  ENDPOINT_URL: http://localhost:9000
-local:
-  null
--- a/modules/ml-pipeline/src/pipeline/configs/configs.py
+++ b/modules/ml-pipeline/src/pipeline/configs/configs.py
@ -1,3 +0,0 @@
-"""
-Stitch all yaml configuration files together, override some settings (such as bucket location) based off environment variables
-"""
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
@ -1,61 +0,0 @@
-feature_processor_type: dataframe
-feature_processor_config:
-  subsample_amount: null
-  subsample_seed: 0
-  target: SAP_ENDING
-  drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
-  # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
-  # retain_features: null
-#   retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
-#  'NUMBER_HEATED_ROOMS',
-#  'FIXED_LIGHTING_OUTLETS_COUNT',
-#  'CONSTRUCTION_AGE_BAND',
-#  'TRANSACTION_TYPE_STARTING',
-#  'LIGHTING_DESCRIPTION_STARTING',
-#  'MAINHEAT_DESCRIPTION_STARTING',
-#  'HOTWATER_DESCRIPTION_STARTING',
-#  'MAIN_FUEL_STARTING',
-#  'MECHANICAL_VENTILATION_STARTING',
-#  'SECONDHEAT_DESCRIPTION_STARTING',
-#  'ENERGY_TARIFF_STARTING',
-#  'SOLAR_WATER_HEATING_FLAG_STARTING',
-#  'PHOTO_SUPPLY_STARTING',
-#  'WINDOWS_DESCRIPTION_STARTING',
-#  'GLAZED_TYPE_STARTING',
-#  'MULTI_GLAZE_PROPORTION_STARTING',
-#  'LOW_ENERGY_LIGHTING_STARTING',
-#  'NUMBER_OPEN_FIREPLACES_STARTING',
-#  'MAINHEATCONT_DESCRIPTION_STARTING',
-#  'EXTENSION_COUNT_STARTING',
-#  'TOTAL_FLOOR_AREA_STARTING',
-#  'FLOOR_HEIGHT_STARTING',
-#  'DAYS_TO_STARTING',
-# 'WALLS_DESCRIPTION_STARTING',
-# 'FLOOR_DESCRIPTION_STARTING']
-#   retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
-#  'NUMBER_HEATED_ROOMS',
-#  'FIXED_LIGHTING_OUTLETS_COUNT',
-#  'CONSTRUCTION_AGE_BAND',
-#  'TRANSACTION_TYPE_ENDING',
-#  'LIGHTING_DESCRIPTION_ENDING',
-#  'MAINHEAT_DESCRIPTION_ENDING',
-#  'HOTWATER_DESCRIPTION_ENDING',
-#  'MAIN_FUEL_ENDING',
-#  'MECHANICAL_VENTILATION_ENDING',
-#  'SECONDHEAT_DESCRIPTION_ENDING',
-#  'ENERGY_TARIFF_ENDING',
-#  'SOLAR_WATER_HEATING_FLAG_ENDING',
-#  'PHOTO_SUPPLY_ENDING',
-#  'WINDOWS_DESCRIPTION_ENDING',
-#  'GLAZED_TYPE_ENDING',
-#  'MULTI_GLAZE_PROPORTION_ENDING',
-#  'LOW_ENERGY_LIGHTING_ENDING',
-#  'NUMBER_OPEN_FIREPLACES_ENDING',
-#  'MAINHEATCONT_DESCRIPTION_ENDING',
-#  'EXTENSION_COUNT_ENDING',
-#  'TOTAL_FLOOR_AREA_ENDING',
-#  'FLOOR_HEIGHT_ENDING',
-#  'DAYS_TO_ENDING',
-# 'WALLS_DESCRIPTION_ENDING',
-# 'FLOOR_DESCRIPTION_ENDING']
-  retain_features: null
--- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
@ -1,3 +0,0 @@
-dataclient_type: local
-metrics_type: Regression
-metrics_output_filepath: ./metrics/metrics.json
--- a/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml
@ -1,5 +0,0 @@
-input_dataclient_type: local
-output_dataclient_type: local
-test_data_filepath: ./data/prepared_data/test.parquet
-predictions_output_filepath: ./data/predictions/predictions.parquet
-predictions_column_name: predictions
--- a/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
@ -1,8 +0,0 @@
-dataclient_type: local
-feature_importance_filepath: ./analysis/feature_importance.parquet
-permutation_subsample_amount: 1000
-loss_fns: "mean_absolute_percentage_error"
-feature_importance_column: importance
-n_repeats: 5
-figwidth: 7
-figheight: 6
--- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
@ -11,6 +11,7 @@ def clip_predictions_to_minimum_value(
    series_name = predictions.name
    predictions.name = "predictions"
    predictions_df = pd.concat([data, predictions], axis=1)
+    # We expect every prediction to be at least a one-point improvement
    replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
    predictions_df.loc[replace_index, "predictions"] = (
        predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
--- a/modules/ml-pipeline/src/pipeline/configs/prediction_analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/prediction_analysis.yaml
@ -1,4 +0,0 @@
-dataclient_type: local
-nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower
-n_val: 30  # how many datapoints from validation data should we interpret predictions for, larger values will be slower
-row_index: [0, 10, 20] # index of an example datapoint
--- a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
@ -1,9 +0,0 @@
-input_dataclient_type: aws-s3
-output_dataclient_type: local
-# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
-data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
-train_proportion: 0.9
-output_train_filepath: ./data/prepared_data/train.parquet
-output_test_filepath: ./data/prepared_data/test.parquet
-
-# cache_o
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@ -0,0 +1,49 @@
+default:
+  startup_cleanup:
+    artefacts: ./data
+    metrics: ./metrics
+
+  client:
+    aws-s3:
+      AWS_ACCESS_KEY_ID: null # Use local credentials
+      AWS_SECRET_ACCESS_KEY: null # Use local credentials
+      ENDPOINT_URL: null # Use local credentials
+    aws-s3-mock:
+      AWS_ACCESS_KEY_ID: minio
+      AWS_SECRET_ACCESS_KEY: minio123
+      ENDPOINT_URL: http://localhost:9000
+    local:
+      null
+
+  prepare_data:
+    input_dataclient_type: aws-s3
+    output_dataclient_type: local
+    data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
+    train_proportion: 0.9
+    output_train_filepath: ./data/prepared_data/train.parquet
+    output_test_filepath: ./data/prepared_data/test.parquet
+
+  feature_processor:
+    feature_processor_type: dataframe
+    feature_processor_config:
+      subsample_amount: null
+      subsample_seed: 0
+      target: SAP_ENDING
+      drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
+      retain_features: null
+
+  generate_predictions:
+    input_dataclient_type: local
+    output_dataclient_type: local
+    test_data_filepath: ./data/prepared_data/test.parquet
+    predictions_output_filepath: ./data/predictions/predictions.parquet
+    predictions_column_name: predictions
+
+  generate_metrics:
+    dataclient_type: local
+    metrics_type: Regression
+    metrics_output_filepath: ./metrics/metrics.json
+
+dev:
+  generate_predictions:
+      input_dataclient_type: aws-s3
--- a/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml
@ -1,2 +0,0 @@
-artefacts: ./data
-metrics: ./metrics
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -5,122 +5,139 @@ stages:
    deps:
    - path: 1_prepare_data.py
      hash: md5
-      md5: 2648d7d407dca857a1d20a11a88d3d98
-      size: 5116
+      md5: c9f030df733e318b80d1fa91b7732f79
+      size: 5132
    params:
-      configs/prepare_data.yaml:
-        output_test_filepath: ./data/prepared_data/test.parquet
-        output_train_filepath: ./data/prepared_data/train.parquet
-        train_proportion: 0.9
+      configs/settings.yaml:
+        default.feature_processor.feature_processor_config.drop_columns:
+        - UPRN
+        - HEAT_DEMAND_CHANGE
+        - CARBON_CHANGE
+        - RDSAP_CHANGE
+        - HEAT_DEMAND_ENDING
+        - CARBON_ENDING
+        default.feature_processor.feature_processor_config.retain_features:
+        default.feature_processor.feature_processor_config.subsample_amount:
+        default.feature_processor.feature_processor_config.subsample_seed: 0
+        default.feature_processor.feature_processor_config.target: SAP_ENDING
+        default.feature_processor.feature_processor_type: dataframe
+        default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
+        default.prepare_data.input_dataclient_type: aws-s3
+        default.prepare_data.output_dataclient_type: local
+        default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
+        default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
+        default.prepare_data.train_proportion: 0.9
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: 7bcbf81a82015276e25749d1bc249a57.dir
-      size: 21076961
+      md5: ed19a11a85d6a2006631173f51569d27.dir
+      size: 21131576
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
    deps:
    - path: 2_build_model.py
      hash: md5
-      md5: 3eb1a5110df6e25a23d8e8a92bb27823
-      size: 5257
+      md5: 039578b629d7cd204016e92cd079ea90
+      size: 5181
    - path: data/prepared_data
      hash: md5
-      md5: 7bcbf81a82015276e25749d1bc249a57.dir
-      size: 21076961
+      md5: ed19a11a85d6a2006631173f51569d27.dir
+      size: 21131576
      nfiles: 2
    params:
      configs/build_model.yaml:
-        AutogluonAutoML:
-          output_filepath: ./data/model/autogluonmodel/
-          problem_type: regression
-          eval_metric: mean_absolute_error
-          time_limit: 800
-          presets: medium_quality
-          excluded_model_types:
-          - KNN
-        SKLearnLinearRegression:
-        SKLearnSVMRegression:
-          kernel: linear
-        fit_metrics_filepath: ./metrics/fit_metrics.json
-        model_save_filepath: ./data/model/autogluonmodel/
-        model_type: AutogluonAutoML
+        default:
+          build_model:
+            model_type: AutogluonAutoML
+            model_save_filepath: ./data/model/autogluonmodel/
+            fit_metrics_filepath: ./metrics/fit_metrics.json
+            SKLearnLinearRegression:
+            SKLearnSVMRegression:
+              kernel: linear
+            AutogluonAutoML:
+              output_filepath: ./data/model/autogluonmodel/
+              problem_type: regression
+              eval_metric: mean_absolute_error
+              time_limit: 75
+              presets: medium_quality
+              excluded_model_types:
+              - KNN
    outs:
    - path: data/model/
      hash: md5
-      md5: 397c46c062b51034b6f8f3f229345de3.dir
-      size: 334481421
-      nfiles: 18
+      md5: 60e253c42cc36934098c627ef3ef4cc1.dir
+      size: 185134993
+      nfiles: 14
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: f6e7e21d4229d4a229ea0a11f3023637
-      size: 184
+      md5: a0c2a1c9e5da0b857d510fa1ba6282a8
+      size: 186
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
-    - path: data/model
-      hash: md5
-      md5: 397c46c062b51034b6f8f3f229345de3.dir
-      size: 334481421
-      nfiles: 18
-    - path: data/prepared_data
-      hash: md5
-      md5: 7bcbf81a82015276e25749d1bc249a57.dir
-      size: 21076961
-      nfiles: 2
    - path: 3_generate_predictions.py
      hash: md5
-      md5: 874da2443ef0d92731e4c127f3ce4acb
-      size: 4434
+      md5: 238b3fa9f3c6f3720e77c116857070ae
+      size: 4720
+    - path: data/model
+      hash: md5
+      md5: 60e253c42cc36934098c627ef3ef4cc1.dir
+      size: 185134993
+      nfiles: 14
+    - path: data/prepared_data
+      hash: md5
+      md5: ed19a11a85d6a2006631173f51569d27.dir
+      size: 21131576
+      nfiles: 2
    params:
-      configs/generate_predictions.yaml:
-        input_dataclient_type: local
-        output_dataclient_type: local
-        predictions_column_name: predictions
-        predictions_output_filepath: ./data/predictions/predictions.parquet
-        test_data_filepath: ./data/prepared_data/test.parquet
+      configs/settings.yaml:
+        default.generate_predictions.input_dataclient_type: local
+        default.generate_predictions.output_dataclient_type: local
+        default.generate_predictions.predictions_column_name: predictions
+        default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet
+        default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 9c18005e722f0e428f4b83c3f974f206.dir
-      size: 381870
+      md5: 700c8767de3a86c4c5339daf3cc17192.dir
+      size: 380962
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
    deps:
+    - path: 4_generate_metrics.py
+      hash: md5
+      md5: 2c9fb78955a8c19cff0a098976f81d1b
+      size: 4487
    - path: data/predictions
      hash: md5
-      md5: 9c18005e722f0e428f4b83c3f974f206.dir
-      size: 381870
+      md5: 700c8767de3a86c4c5339daf3cc17192.dir
+      size: 380962
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: 7bcbf81a82015276e25749d1bc249a57.dir
-      size: 21076961
+      md5: ed19a11a85d6a2006631173f51569d27.dir
+      size: 21131576
      nfiles: 2
-    - path: 4_generate_metrics.py
-      hash: md5
-      md5: 8ce0b6b55e1688fca816985e0cf37f28
-      size: 4220
    params:
-      configs/generate_metrics.yaml:
-        dataclient_type: local
-        metrics_output_filepath: ./metrics/metrics.json
-        metrics_type: Regression
+      configs/settings.yaml:
+        default.generate_metrics.dataclient_type: local
+        default.generate_metrics.metrics_output_filepath: ./metrics/metrics.json
+        default.generate_metrics.metrics_type: Regression
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 93d9b69d6cd951ae2c14b29ba92a2a38
-      size: 186
+      md5: 45ffac8f6e7283df4b69af8a9abc45e1
+      size: 184
  startup_cleanup:
    cmd: python 0_startup_cleanup.py
    deps:
    - path: 0_startup_cleanup.py
      hash: md5
-      md5: 2e51fbcac960d0f960bf32a8ec7486a0
-      size: 1748
+      md5: fbb7e3b1b98b517c870f3e1df3e7f695
+      size: 1676
    params:
-      configs/startup_cleanup.yaml:
-        artefacts: ./data
-        metrics: ./metrics
+      configs/settings.yaml:
+        default.startup_cleanup.artefacts: ./data
+        default.startup_cleanup.metrics: ./metrics
--- a/modules/ml-pipeline/src/pipeline/dvc.yaml
+++ b/modules/ml-pipeline/src/pipeline/dvc.yaml
@ -4,19 +4,28 @@ stages:
    deps:
    - 0_startup_cleanup.py
    params:
-    - configs/startup_cleanup.yaml:
-      - artefacts
-      - metrics
+    - configs/settings.yaml:
+      - default.startup_cleanup.artefacts
+      - default.startup_cleanup.metrics
    always_changed: true
  prepare_data:
    cmd: python 1_prepare_data.py
    deps:
    - 1_prepare_data.py
    params:
-    - configs/prepare_data.yaml:
-      - output_test_filepath
-      - output_train_filepath
-      - train_proportion
+    - configs/settings.yaml:
+      - default.prepare_data.input_dataclient_type
+      - default.prepare_data.output_dataclient_type
+      - default.prepare_data.data_filepath
+      - default.prepare_data.train_proportion
+      - default.prepare_data.output_train_filepath
+      - default.prepare_data.output_test_filepath
+      - default.feature_processor.feature_processor_type
+      - default.feature_processor.feature_processor_config.subsample_amount
+      - default.feature_processor.feature_processor_config.subsample_seed
+      - default.feature_processor.feature_processor_config.target
+      - default.feature_processor.feature_processor_config.drop_columns
+      - default.feature_processor.feature_processor_config.retain_features
    outs:
    - data/prepared_data/
    always_changed: true
@ -38,7 +47,12 @@ stages:
    - data/prepared_data
    - data/model
    params:
-    - configs/generate_predictions.yaml:
+    - configs/settings.yaml:
+      - default.generate_predictions.input_dataclient_type
+      - default.generate_predictions.output_dataclient_type
+      - default.generate_predictions.test_data_filepath
+      - default.generate_predictions.predictions_output_filepath
+      - default.generate_predictions.predictions_column_name
    outs:
    - data/predictions/
    always_changed: true
@ -49,7 +63,10 @@ stages:
    - data/prepared_data
    - data/predictions
    params:
-    - configs/generate_metrics.yaml:
+    - configs/settings.yaml:
+      - default.generate_metrics.dataclient_type
+      - default.generate_metrics.metrics_type
+      - default.generate_metrics.metrics_output_filepath
    outs:
    - metrics/metrics.json
    always_changed: true
--- a/modules/ml-pipeline/src/pipeline/model_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/model_analysis.py
@ -3,8 +3,6 @@ Post Model generation step:
 We want to look at feature analysis of the model
 """

-import yaml
-from pathlib import Path
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceDataClient import DataClient
 from core.Logger import logger
@ -13,27 +11,16 @@ from core.DataClient import dataclient_factory
 from alibi.explainers import PermutationImportance, plot_permutation_importance
 import numpy as np
 import pandas as pd
+from config import settings


-client_path = Path(__file__).parent / "configs" / "client.yaml"
-client_params = yaml.safe_load(open(client_path))
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+feature_process_params = settings.feature_processor
+build_model_params = settings.build_model
+generate_predictions_params = settings.generate_predictions

-prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
-prepare_data_params = yaml.safe_load(open(prepare_data_path))
-
-feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
-feature_process_params = yaml.safe_load(open(feature_process_path))
-
-build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
-build_model_params = yaml.safe_load(open(build_model_path))
-
-model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
-model_analysis_params = yaml.safe_load(open(model_analysis_path))
-
-generate_predictions_path = (
-    Path(__file__).parent / "configs" / "generate_predictions.yaml"
-)
-generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
+model_analysis_params = settings.model_analysis

 model = model_factory(build_model_params["model_type"])
 model.load_model(build_model_params["model_save_filepath"])
--- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
@ -12,40 +12,21 @@ import shap

 shap.initjs()

-
-import yaml
 from typing import List
-from pathlib import Path
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceDataClient import DataClient
 from core.Logger import logger
 from core.MLModels import model_factory
 from core.DataClient import dataclient_factory
-import numpy as np
 import pandas as pd
+from config import settings

-
-client_path = Path(__file__).parent / "configs" / "client.yaml"
-client_params = yaml.safe_load(open(client_path))
-
-prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
-prepare_data_params = yaml.safe_load(open(prepare_data_path))
-
-feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
-feature_process_params = yaml.safe_load(open(feature_process_path))
-
-build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
-build_model_params = yaml.safe_load(open(build_model_path))
-
-generate_predictions_path = (
-    Path(__file__).parent / "configs" / "generate_predictions.yaml"
-)
-generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
-
-prediction_analysis_path = (
-    Path(__file__).parent / "configs" / "prediction_analysis.yaml"
-)
-prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path))
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+feature_process_params = settings.feature_processor
+build_model_params = settings.build_model
+generate_predictions_params = settings.generate_predictions
+prediction_analysis_params = settings.prediction_analysis

 model = model_factory(build_model_params["model_type"])
 model.load_model(build_model_params["model_save_filepath"])