Merge pull request #37 from Hestia-Homes/model-test

Model test
2026-07-27 22:45:04 +00:00 · 2023-09-21 23:00:20 +01:00 · 2023-09-21 23:00:20 +01:00 · 7729f96903
commit 7729f96903
parent 8cfa9a6eb1 1e70a3a582
17 changed files with 556 additions and 50 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -14,6 +14,6 @@ repos:
    hooks:
    -   id: dvc-push-experiment
        name: DVC - Push to experiment to remote location (experiments)
-        entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"'
+        entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"'
        language: system
        verbose: true
--- a/modules/ml-pipeline/.gitignore
+++ b/modules/ml-pipeline/.gitignore
@ -1,4 +1,5 @@
 .dev_env/
+.dev_env_pipeline/
 __pycache__/
 .DS_Store
 .vscode/
--- a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
+++ b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
--- a/modules/ml-pipeline/src/pipeline/build_model.py
+++ b/modules/ml-pipeline/src/pipeline/build_model.py
@ -68,13 +68,13 @@ def build_model(
        data=train_data, target=target, model_hyperparameters=model_hyperparameters
    )

-    logger.info("------------------------------")
-    logger.info("--- Generating predictions ---")
-    logger.info("------------------------------")
+    logger.info("----------------------------------")
+    logger.info("--- Generating fit predictions ---")
+    logger.info("----------------------------------")

    prediction_data = train_data.drop(columns=target)

-    predictions = model.predict(data=prediction_data)
+    fit_predictions = model.predict(data=prediction_data)

    logger.info("------------------------------")
    logger.info("--- Generating fit metrics ---")
@ -82,7 +82,7 @@ def build_model(

    metrics_output = metrics.generate_metrics(
        target=train_data[target],
-        predictions=pd.Series(predictions),
+        predictions=pd.Series(fit_predictions),
    )

    logger.info("--------------------")
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -1,5 +1,5 @@
-model_type: SKLearnLinearRegression
-model_save_filepath: ./data/model/model.joblib
+model_type: AutogluonAutoML
+model_save_filepath: ./data/model/autogluonmodel/
 fit_metrics_filepath: ./metrics/fit_metrics.json

 SKLearnLinearRegression: null
@ -12,5 +12,5 @@ AutogluonAutoML:
  problem_type: regression
  eval_metric: mean_absolute_error
  time_limit: 400
-  presets: high_quality
+  presets: good_quality
  excluded_model_types: ['KNN']
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
@ -2,7 +2,59 @@ feature_processor_type: dataframe
 feature_processor_config:
  subsample_amount: null
  subsample_seed: 0
-  target: RDSAP_CHANGE
-  drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"]
-  retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
+  target: SAP_ENDING
+  drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"]
+  # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
  # retain_features: null
+#   retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
+#  'NUMBER_HEATED_ROOMS',
+#  'FIXED_LIGHTING_OUTLETS_COUNT',
+#  'CONSTRUCTION_AGE_BAND',
+#  'TRANSACTION_TYPE_STARTING',
+#  'LIGHTING_DESCRIPTION_STARTING',
+#  'MAINHEAT_DESCRIPTION_STARTING',
+#  'HOTWATER_DESCRIPTION_STARTING',
+#  'MAIN_FUEL_STARTING',
+#  'MECHANICAL_VENTILATION_STARTING',
+#  'SECONDHEAT_DESCRIPTION_STARTING',
+#  'ENERGY_TARIFF_STARTING',
+#  'SOLAR_WATER_HEATING_FLAG_STARTING',
+#  'PHOTO_SUPPLY_STARTING',
+#  'WINDOWS_DESCRIPTION_STARTING',
+#  'GLAZED_TYPE_STARTING',
+#  'MULTI_GLAZE_PROPORTION_STARTING',
+#  'LOW_ENERGY_LIGHTING_STARTING',
+#  'NUMBER_OPEN_FIREPLACES_STARTING',
+#  'MAINHEATCONT_DESCRIPTION_STARTING',
+#  'EXTENSION_COUNT_STARTING',
+#  'TOTAL_FLOOR_AREA_STARTING',
+#  'FLOOR_HEIGHT_STARTING',
+#  'DAYS_TO_STARTING',
+# 'WALLS_DESCRIPTION_STARTING',
+# 'FLOOR_DESCRIPTION_STARTING']
+  retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
+ 'NUMBER_HEATED_ROOMS',
+ 'FIXED_LIGHTING_OUTLETS_COUNT',
+ 'CONSTRUCTION_AGE_BAND',
+ 'TRANSACTION_TYPE_ENDING',
+ 'LIGHTING_DESCRIPTION_ENDING',
+ 'MAINHEAT_DESCRIPTION_ENDING',
+ 'HOTWATER_DESCRIPTION_ENDING',
+ 'MAIN_FUEL_ENDING',
+ 'MECHANICAL_VENTILATION_ENDING',
+ 'SECONDHEAT_DESCRIPTION_ENDING',
+ 'ENERGY_TARIFF_ENDING',
+ 'SOLAR_WATER_HEATING_FLAG_ENDING',
+ 'PHOTO_SUPPLY_ENDING',
+ 'WINDOWS_DESCRIPTION_ENDING',
+ 'GLAZED_TYPE_ENDING',
+ 'MULTI_GLAZE_PROPORTION_ENDING',
+ 'LOW_ENERGY_LIGHTING_ENDING',
+ 'NUMBER_OPEN_FIREPLACES_ENDING',
+ 'MAINHEATCONT_DESCRIPTION_ENDING',
+ 'EXTENSION_COUNT_ENDING',
+ 'TOTAL_FLOOR_AREA_ENDING',
+ 'FLOOR_HEIGHT_ENDING',
+ 'DAYS_TO_ENDING',
+'WALLS_DESCRIPTION_ENDING',
+'FLOOR_DESCRIPTION_ENDING']
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@ -10,4 +10,10 @@ business_logic = {}
 """
 New features dict + function
 """
-new_feature_funcs = {}
+
+
+def SAP_ENDING(df):
+    """
+    Build the SAP_ENDING feature: the element-wise sum of the
+    SAP_STARTING and RDSAP_CHANGE columns of `df`.
+    """
+    starting_sap = df["SAP_STARTING"]
+    sap_change = df["RDSAP_CHANGE"]
+    return starting_sap + sap_change
+
+
+new_feature_funcs = {"SAP_ENDING": SAP_ENDING}
--- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
@ -1,5 +1,3 @@
 dataclient_type: local
-input_datahandler_type: parquet
-output_datahandler_type: json
 metrics_type: Regression
 metrics_output_filepath: ./metrics/metrics.json
--- a/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
@ -0,0 +1,8 @@
+dataclient_type: local
+feature_importance_filepath: ./analysis/feature_importance.parquet
+permutation_subsample_amount: 1000
+loss_fns: "mean_absolute_percentage_error"
+feature_importance_column: importance
+n_repeats: 5
+figwidth: 7
+figheight: 6
--- a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
@ -1,6 +1,5 @@
 input_dataclient_type: aws-s3
 output_dataclient_type: local
-datahandler_type: parquet
 data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
 train_proportion: 0.9
 output_train_filepath: ./data/prepared_data/train.parquet
--- a/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py
+++ b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py
@ -134,6 +134,8 @@ class DataFrameFeatureProcessor:
            subsample_amount=feature_processor_config["subsample_amount"],
            subsample_seed=feature_processor_config["subsample_seed"],
        )
+        df = self.apply_business_logic(df, business_logic=business_logic)
+        df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
        df = self.drop_unused_columns(
            df, drop_columns=feature_processor_config["drop_columns"]
        )
@ -142,6 +144,4 @@ class DataFrameFeatureProcessor:
            retain_features=feature_processor_config["retain_features"],
            target=feature_processor_config["target"],
        )
-        df = self.apply_business_logic(df, business_logic=business_logic)
-        df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
        return df
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -5,8 +5,8 @@ stages:
    deps:
    - path: prepare_data.py
      hash: md5
-      md5: 7531a931a405650dc4e8b5d8c1fd3c66
-      size: 4959
+      md5: 934d774e67f38e440b621ce71152f5f6
+      size: 5031
    params:
      configs/prepare_data.yaml:
        output_test_filepath: ./data/prepared_data/test.parquet
@ -15,20 +15,20 @@ stages:
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
      nfiles: 2
  build_model:
    cmd: python build_model.py
    deps:
    - path: build_model.py
      hash: md5
-      md5: c07ce0b8fdaf337ddfb7115684932157
-      size: 5048
+      md5: f9fa2a66d908b42ae196ce6f0f782258
+      size: 5134
    - path: data/prepared_data
      hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
      nfiles: 2
    params:
      configs/build_model.yaml:
@ -37,42 +37,42 @@ stages:
          problem_type: regression
          eval_metric: mean_absolute_error
          time_limit: 400
-          presets: high_quality
+          presets: good_quality
          excluded_model_types:
          - KNN
        SKLearnLinearRegression:
        SKLearnSVMRegression:
          kernel: linear
        fit_metrics_filepath: ./metrics/fit_metrics.json
-        model_save_filepath: ./data/model/model.joblib
-        model_type: SKLearnLinearRegression
+        model_save_filepath: ./data/model/autogluonmodel/
+        model_type: AutogluonAutoML
    outs:
    - path: data/model/
      hash: md5
-      md5: 2ace0835c28543512982b69d383b3c49.dir
-      size: 1832
-      nfiles: 1
+      md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
+      size: 118227750
+      nfiles: 71
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: c8c5a40863e2ced7f5f5a844ba203d80
-      size: 180
+      md5: e1c9a16617804f48e8ffac7cec6575ca
+      size: 185
  generate_predictions:
    cmd: python generate_predictions.py
    deps:
    - path: data/model
      hash: md5
-      md5: 2ace0835c28543512982b69d383b3c49.dir
-      size: 1832
-      nfiles: 1
+      md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
+      size: 118227750
+      nfiles: 71
    - path: data/prepared_data
      hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
      nfiles: 2
    - path: generate_predictions.py
      hash: md5
-      md5: ab603e9a526a73f2fe17603e6fe6c0a4
-      size: 4261
+      md5: a25c4611ff467cdc1c921918112a30fe
+      size: 4311
    params:
      configs/generate_predictions.yaml:
        input_dataclient_type: local
@ -83,26 +83,26 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
-      size: 643838
+      md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
+      size: 536774
      nfiles: 1
  generate_metrics:
    cmd: python generate_metrics.py
    deps:
    - path: data/predictions
      hash: md5
-      md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
-      size: 643838
+      md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
+      size: 536774
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
      nfiles: 2
    - path: generate_metrics.py
      hash: md5
-      md5: 78a9b9b25d0a7deaf44277f9afad5f98
-      size: 4139
+      md5: 8ce0b6b55e1688fca816985e0cf37f28
+      size: 4220
    params:
      configs/generate_metrics.yaml:
        dataclient_type: local
@ -113,7 +113,7 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: f494881710a057f90f82c0bd3a40a41d
+      md5: 852ef4cf2ca5e7f89d70420a9df7a596
      size: 183
  startup_cleanup:
    cmd: python startup_cleanup.py
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@ -0,0 +1,177 @@
+"""
+Doing some eda on dataset
+"""
+# Look at response variable
+
+import math
+
+from matplotlib import pyplot as plt
+import pandas as pd
+
+train_df = pd.read_parquet("./data/prepared_data/train.parquet")
+target = "SAP_ENDING"
+
+# Restrict the exploration to the first 10,000 training rows
+train_df = train_df.head(10000)
+
+# train_df[target].plot(kind='hist')
+
+# Plot the target variable with one unit-wide bin per integer value.
+# `bins` is a sequence of bin EDGES, so the edges have to run one past the
+# largest value; range(min, max) stopped one edge short, which left the
+# largest target values out of the histogram (and failed on float targets).
+target_min = math.floor(train_df[target].min())
+target_max = math.floor(train_df[target].max())
+fig, ax = plt.subplots(figsize=(10, 7))
+ax.hist(train_df[target], bins=range(target_min, target_max + 2))
+
+fig
+
+# Find correlations between the columns (numeric)
+train_df.dtypes
+# All numerical
+
+train_df_corr = train_df.corr()
+
+train_df_corr.style.background_gradient(cmap="coolwarm")
+
+train_df_corr["EXTENSION_COUNT_ENDING"]
+
+# Check out some correlation plots between variables
+# sap starting - negative correlation
+
+train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
+
+# heat demand - light positive correlation
+train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
+    x=target, y="HEAT_DEMAND_STARTING", style="o"
+)
+
+# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
+
+# Load the autogluon model and check feature importance
+
+
+import os
+import yaml
+import pandas as pd
+from pathlib import Path
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.DataClient import dataclient_factory
+from core.MLModels import model_factory
+from core.Logger import logger
+
+
+def _read_yaml(path):
+    """Parse the YAML file at `path`, closing the file handle afterwards."""
+    with open(path) as config_file:
+        return yaml.safe_load(config_file)
+
+
+RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
+
+# Pipeline configuration files, read from ./configs next to this script
+client_path = Path(__file__).parent / "configs" / "client.yaml"
+client_params = _read_yaml(client_path)
+
+prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
+prepare_data_params = _read_yaml(prepare_data_path)
+
+build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
+build_model_params = _read_yaml(build_model_path)
+
+generate_predictions_path = (
+    Path(__file__).parent / "configs" / "generate_predictions.yaml"
+)
+generate_predictions_params = _read_yaml(generate_predictions_path)
+
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = _read_yaml(feature_process_path)
+
+# Instantiate the configured model type and load the saved model
+model = model_factory(build_model_params["model_type"])
+model_filepath = build_model_params["model_save_filepath"]
+
+model.load_model(model_filepath)
+
+fi = model.model.feature_importance(train_df.reset_index(drop=True))
+
+pred = pd.read_parquet("./data/predictions/predictions.parquet")
+test_df = pd.read_parquet("./data/prepared_data/test.parquet")
+
+# test_df = test_df.head(1000)
+
+# Column assignment aligns on the index of the two frames
+test_df["predictions"] = pred["predictions"]
+
+# Mean absolute error of the predictions per property type
+test_df.groupby("PROPERTY_TYPE").apply(
+    lambda x: (x.SAP_ENDING - x.predictions).abs().mean()
+)
+
+test_df.head()
+# Take a copy so that adding the residual column below does not write into a
+# slice of test_df (this is what triggers pandas' SettingWithCopyWarning).
+flat_df = test_df[test_df["PROPERTY_TYPE"] == "Flat"].copy()
+
+flat_df["residual"] = abs(flat_df["predictions"] - flat_df[target])
+
+generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
+generate_metrics_params = _read_yaml(generate_metrics_path)
+from core.MLMetrics import metrics_factory
+
+metrics = metrics_factory(generate_metrics_params["metrics_type"])
+
+# Metrics restricted to the rows whose PROPERTY_TYPE is "Flat"
+metrics_output = metrics.generate_metrics(
+    target=flat_df[target],
+    predictions=pd.Series(flat_df["predictions"]),
+)
+
+# Use alibi to run permutation importance
+
+from alibi.explainers import PermutationImportance, plot_permutation_importance
+from sklearn.metrics import mean_absolute_percentage_error
+import numpy as np
+import pandas as pd
+
+# Reload the test set and keep only its first 1000 rows for the explanation
+test_df = pd.read_parquet("./data/prepared_data/test.parquet")
+test_df = test_df.head(1000)
+
+target = "SAP_ENDING"
+# Every column except the target is treated as a feature
+feature_names = test_df.columns.to_list()
+feature_names.remove(target)
+
+x = test_df[feature_names].to_numpy()
+y = test_df[target].to_numpy()
+
+
+def predict_fn(X: np.ndarray) -> np.ndarray:
+    # Rebuild a DataFrame with the feature column names before calling the
+    # model, since the explainer is handed a plain numpy array (`x` above).
+    return model.predict(pd.DataFrame(X, columns=feature_names))
+
+
+pfi = PermutationImportance(
+    predictor=predict_fn,
+    loss_fns=mean_absolute_percentage_error,
+    feature_names=feature_names,
+    verbose=True,
+)
+
+exp = pfi.explain(x, y)
+plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
+
+# NOTE: the list below is a bare expression statement; it is not assigned to
+# anything and has no effect when the script runs.
+[
+    "PROPERTY_TYPE",
+    "BUILT_FORM",
+    "CONSTITUENCY",
+    "NUMBER_HABITABLE_ROOMS",
+    "NUMBER_HEATED_ROOMS",
+    "FIXED_LIGHTING_OUTLETS_COUNT",
+    "CONSTRUCTION_AGE_BAND",
+    "TRANSACTION_TYPE_STARTING",
+    "LIGHTING_DESCRIPTION_STARTING",
+    "MAINHEAT_DESCRIPTION_STARTING",
+    "HOTWATER_DESCRIPTION_STARTING",
+    "MAIN_FUEL_STARTING",
+    "MECHANICAL_VENTILATION_STARTING",
+    "SECONDHEAT_DESCRIPTION_STARTING",
+    "ENERGY_TARIFF_STARTING",
+    "SOLAR_WATER_HEATING_FLAG_STARTING",
+    "PHOTO_SUPPLY_STARTING",
+    "WINDOWS_DESCRIPTION_STARTING",
+    "GLAZED_TYPE_STARTING",
+    "MULTI_GLAZE_PROPORTION_STARTING",
+    "LOW_ENERGY_LIGHTING_STARTING",
+    "NUMBER_OPEN_FIREPLACES_STARTING",
+    "MAINHEATCONT_DESCRIPTION_STARTING",
+    "EXTENSION_COUNT_STARTING",
+    "TOTAL_FLOOR_AREA_STARTING",
+    "FLOOR_HEIGHT_STARTING",
+    "DAYS_TO_STARTING",
+    "WALLS_DESCRIPTION_STARTING",
+    "FLOOR_DESCRIPTION_STARTING",
+]
+
+# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
+#
+#
--- a/modules/ml-pipeline/src/pipeline/model_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/model_analysis.py
@ -0,0 +1,150 @@
+"""
+Post Model generation step:
+We want to look at feature analysis of the model
+"""
+
+import yaml
+from pathlib import Path
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.Logger import logger
+from core.MLModels import model_factory
+from core.DataClient import dataclient_factory
+from alibi.explainers import PermutationImportance, plot_permutation_importance
+import numpy as np
+import pandas as pd
+
+
+def _read_yaml(path):
+    """Parse the YAML file at `path`, closing the file handle afterwards."""
+    with open(path) as config_file:
+        return yaml.safe_load(config_file)
+
+
+# Pipeline configuration files, read from ./configs next to this script
+client_path = Path(__file__).parent / "configs" / "client.yaml"
+client_params = _read_yaml(client_path)
+
+prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
+prepare_data_params = _read_yaml(prepare_data_path)
+
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = _read_yaml(feature_process_path)
+
+build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
+build_model_params = _read_yaml(build_model_path)
+
+model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
+model_analysis_params = _read_yaml(model_analysis_path)
+
+generate_predictions_path = (
+    Path(__file__).parent / "configs" / "generate_predictions.yaml"
+)
+generate_predictions_params = _read_yaml(generate_predictions_path)
+
+# Instantiate the configured model type and load the saved model
+model = model_factory(build_model_params["model_type"])
+model.load_model(build_model_params["model_save_filepath"])
+
+# The client settings are looked up in client.yaml under the client type name
+dataclient_type = model_analysis_params["dataclient_type"]
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)
+
+
+# Arguments for model_analysis(), taken from the config files
+feature_importance_filepath = model_analysis_params["feature_importance_filepath"]
+permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"]
+loss_fns = model_analysis_params["loss_fns"]
+feature_importance_column = model_analysis_params["feature_importance_column"]
+n_repeats = model_analysis_params["n_repeats"]
+figwidth = model_analysis_params["figwidth"]
+figheight = model_analysis_params["figheight"]
+target = feature_process_params["feature_processor_config"]["target"]
+output_test_filepath = prepare_data_params["output_test_filepath"]
+
+
+def model_analysis(
+    model: MLModel,
+    dataclient: DataClient,
+    target: str,
+    output_test_filepath: str,
+    feature_importance_filepath: str,
+    permutation_subsample_amount: int = 100,
+    loss_fns: str = "mean_absolute_percentage_error",
+    feature_importance_column: str = "importance",
+    n_repeats: int = 5,
+    figwidth: int = 7,
+    figheight: int = 6,
+):
+    """
+    Compute permutation feature importance for a model on the test data
+    and save the resulting table.
+
+    Args:
+        model: Model to explain; only its `predict` method is called.
+        dataclient: Client used to load the test data and save the table.
+        target: Name of the target column in the test data.
+        output_test_filepath: Location the test data is loaded from.
+        feature_importance_filepath: Location the importance table is saved to.
+        permutation_subsample_amount: Number of leading test rows to use.
+        loss_fns: Loss handed to alibi's PermutationImportance.
+        feature_importance_column: Column name for the importance values.
+        n_repeats: Number of shuffles per feature.
+        figwidth: Width passed to the importance plot.
+        figheight: Height passed to the importance plot.
+
+    Raises:
+        ValueError: If `target` is not a column of the test data.
+    """
+
+    logger.info("------------------------------------")
+    logger.info("--- Generate Feature Importance ---")
+    logger.info("------------------------------------")
+
+    test_df = dataclient.load_data(output_test_filepath)
+
+    # Fail with a readable message instead of list.remove()'s generic error
+    if target not in test_df.columns:
+        raise ValueError(
+            f"Target column '{target}' not found in data loaded from "
+            f"{output_test_filepath}"
+        )
+
+    test_df = test_df.head(permutation_subsample_amount)
+
+    # Every column except the target is treated as a feature
+    feature_names = [column for column in test_df.columns if column != target]
+
+    x = test_df[feature_names].to_numpy()
+    y = test_df[target].to_numpy()
+
+    def predict_fn(X: np.ndarray) -> np.ndarray:
+        # The explainer is handed a plain numpy array (`x`), so the column
+        # names are restored before calling the model.
+        return model.predict(pd.DataFrame(X, columns=feature_names))
+
+    pfi = PermutationImportance(
+        predictor=predict_fn,
+        loss_fns=loss_fns,
+        feature_names=feature_names,
+        verbose=True,
+    )
+
+    # Report the rows actually used: head() returns fewer rows than requested
+    # when the test data is smaller than permutation_subsample_amount.
+    logger.info(
+        f"Permutation feature importance - using {len(test_df)} samples and {n_repeats} shuffles per feature:"
+    )
+
+    exp = pfi.explain(x, y, n_repeats=n_repeats)
+
+    # Keep only the mean importance of each feature for the output table
+    mean_value_feature_importance = [
+        element["mean"] for element in exp.data["feature_importance"][0]
+    ]
+    feature_importance_df = pd.DataFrame(
+        mean_value_feature_importance,
+        index=exp.data["feature_names"],
+        columns=[feature_importance_column],
+    ).sort_values(feature_importance_column, ascending=False)
+
+    # NOTE(review): the plot's return value is discarded and nothing here
+    # saves or shows the figure - confirm whether it should be persisted.
+    plot_permutation_importance(
+        exp, fig_kw={"figwidth": figwidth, "figheight": figheight}
+    )
+
+    logger.info("--------------------------------------")
+    logger.info("--- Save Feature Importance table ---")
+    logger.info("--------------------------------------")
+
+    dataclient.save_data(feature_importance_df, location=feature_importance_filepath)
+
+
+if __name__ == "__main__":
+
+    # Script entry point: run the analysis with the values read from the configs
+    for banner_line in (
+        "----------------------------",
+        f"--- {__file__} - Start! ---",
+        "----------------------------",
+    ):
+        logger.info(banner_line)
+
+    analysis_kwargs = {
+        "model": model,
+        "dataclient": dataclient,
+        "target": target,
+        "output_test_filepath": output_test_filepath,
+        "feature_importance_filepath": feature_importance_filepath,
+        "permutation_subsample_amount": permutation_subsample_amount,
+        "loss_fns": loss_fns,
+        "feature_importance_column": feature_importance_column,
+        "n_repeats": n_repeats,
+        "figwidth": figwidth,
+        "figheight": figheight,
+    }
+    model_analysis(**analysis_kwargs)
+
+    for banner_line in (
+        "-------------------------------",
+        f"--- {__file__} - Complete! ---",
+        "-------------------------------",
+    ):
+        logger.info(banner_line)
--- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
@ -0,0 +1,111 @@
+"""
+Look at why the model made such a prediction
+Manual script to run
+Workflow:
+- Identify a prediction row/s that you wish to look into
+    - i.e. a bad prediction/s
+- Add these rows to the config
+- Run script
+"""
+
+import shap
+
+shap.initjs()
+
+
+import yaml
+from pathlib import Path
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.Logger import logger
+from core.MLModels import model_factory
+from core.DataClient import dataclient_factory
+import numpy as np
+import pandas as pd
+
+
+def _read_yaml(path):
+    """Parse the YAML file at `path`, closing the file handle afterwards."""
+    with open(path) as config_file:
+        return yaml.safe_load(config_file)
+
+
+# Pipeline configuration files, read from ./configs next to this script
+client_path = Path(__file__).parent / "configs" / "client.yaml"
+client_params = _read_yaml(client_path)
+
+prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
+prepare_data_params = _read_yaml(prepare_data_path)
+
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = _read_yaml(feature_process_path)
+
+build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
+build_model_params = _read_yaml(build_model_path)
+
+# NOTE(review): confirm configs/prediction_analysis.yaml exists; it must
+# define `dataclient_type`, which is read below.
+prediction_analysis_path = (
+    Path(__file__).parent / "configs" / "prediction_analysis.yaml"
+)
+prediction_analysis_params = _read_yaml(prediction_analysis_path)
+
+# Instantiate the configured model type and load the saved model
+model = model_factory(build_model_params["model_type"])
+model.load_model(build_model_params["model_save_filepath"])
+
+# The client settings are looked up in client.yaml under the client type name
+dataclient_type = prediction_analysis_params["dataclient_type"]
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)
+
+output_test_filepath = prepare_data_params["output_test_filepath"]
+
+
+def prediction_analysis(
+    model: MLModel,
+    dataclient: DataClient,
+    output_test_filepath: str,
+    target: str = "SAP_ENDING",
+    row_index: int = 0,
+    nshap_samples: int = 100,
+):
+    """
+    Explain a single test-set prediction with SHAP's KernelExplainer.
+
+    Args:
+        model: Wrapper whose `.model` attribute exposes `predict`.
+        dataclient: Client used to load the test data.
+        output_test_filepath: Location the test data is loaded from.
+        target: Name of the target column, dropped before explaining.
+        row_index: Positional index of the test row to explain.
+        nshap_samples: Number of samples used to approximate each Shapley
+            value; larger values will be slower.
+
+    Returns:
+        Whatever `shap.force_plot` returns for the selected row.
+    """
+
+    test_df = dataclient.load_data(output_test_filepath)
+    test_df_without_target = test_df.drop(columns=[target])
+
+    # test_df_summary = shap.kmeans(test_df, 10)
+    # print("Baseline feature-values: \n", test_df_summary)
+    class AutogluonWrapper:
+        """Adapter that turns whatever shap passes in into a named DataFrame."""
+
+        def __init__(self, predictor, feature_names):
+            self.ag_model = predictor
+            self.feature_names = feature_names
+
+        def predict(self, X):
+            # A single row given as a Series becomes a one-row 2-D array
+            if isinstance(X, pd.Series):
+                X = X.values.reshape(1, -1)
+            if not isinstance(X, pd.DataFrame):
+                X = pd.DataFrame(X, columns=self.feature_names)
+            return self.ag_model.predict(X)
+
+    ag_wrapper = AutogluonWrapper(
+        model.model, feature_names=test_df_without_target.columns
+    )
+    # NOTE(review): the whole test set is passed as background data; the
+    # commented-out shap.kmeans summary above may have been meant to shrink
+    # it - confirm the runtime is acceptable.
+    explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target)
+
+    single_datapoint = test_df_without_target.iloc[[row_index]]
+    single_prediction = ag_wrapper.predict(single_datapoint)
+    logger.info(f"Prediction for row {row_index}:\n{single_prediction}")
+
+    shap_values_single = explainer.shap_values(single_datapoint, nsamples=nshap_samples)
+    # Returned rather than discarded so the caller can display or save the plot
+    return shap.force_plot(
+        explainer.expected_value,
+        shap_values_single,
+        test_df_without_target.iloc[row_index, :],
+    )
+
+
+if __name__ == "__main__":
+
+    logger.info("----------------------------")
+    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")
+
+    # Take the target from the feature processor config instead of relying on
+    # the hard-coded default.
+    prediction_analysis(
+        model=model,
+        dataclient=dataclient,
+        output_test_filepath=output_test_filepath,
+        target=feature_process_params["feature_processor_config"]["target"],
+    )
+
+    logger.info("-------------------------------")
+    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
--- a/modules/ml-pipeline/src/pipeline/prepare_data.py
+++ b/modules/ml-pipeline/src/pipeline/prepare_data.py
@ -74,6 +74,9 @@ def prepare_data(
        train, test = train_test_split(
            data, train_size=train_proportion, test_size=(1 - train_proportion)
        )
+        test = test.reset_index(drop=True)
+
+    train = train.reset_index(drop=True)

    logger.info("-----------------------")
    logger.info("--- Outputting data ---")
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
@ -2,6 +2,7 @@ joblib==1.3.2
 boto3==1.28.17
 pandas==1.5.3
 autogluon==0.8.2
+alibi==0.9.4
 pyarrow==13.0.0
 pre-commit==3.3.3
 sphinx==7.2.5