From 6513e4feb9b64450aca254ed3c806ac57faaf1f8 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Thu, 21 Sep 2023 21:16:48 +0000
Subject: [PATCH 1/3] add feature importance in model analysis script

---
 modules/ml-pipeline/.gitignore                |   1 +
 .../analysis/feature_importance.parquet       | Bin 0 -> 3117 bytes
 .../ml-pipeline/src/pipeline/build_model.py   |  10 +-
 .../src/pipeline/configs/build_model.yaml     |   6 +-
 .../pipeline/configs/feature_processor.yaml   |  58 +++++-
 .../configs/feature_processor_logic.py        |   8 +-
 .../src/pipeline/configs/model_analysis.yaml  |   8 +
 .../src/pipeline/configs/prepare_data.yaml    |   1 -
 .../src/pipeline/core/FeatureProcessor.py     |   4 +-
 modules/ml-pipeline/src/pipeline/dvc.lock     |  66 +++----
 modules/ml-pipeline/src/pipeline/eda.py       | 177 ++++++++++++++++++
 .../src/pipeline/model_analysis.py            | 150 +++++++++++++++
 .../src/pipeline/prediction_analysis.py       |   4 +
 .../training/requirements-dev.txt             |   1 +
 14 files changed, 446 insertions(+), 48 deletions(-)
 create mode 100644 modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
 create mode 100644 modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
 create mode 100644 modules/ml-pipeline/src/pipeline/eda.py
 create mode 100644 modules/ml-pipeline/src/pipeline/model_analysis.py
 create mode 100644 modules/ml-pipeline/src/pipeline/prediction_analysis.py
diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore
index 63900db..664bc8d 100644
--- a/modules/ml-pipeline/.gitignore
+++ b/modules/ml-pipeline/.gitignore
@@ -1,4 +1,5 @@
 .dev_env/
+.dev_env_pipeline/
 __pycache__/
 .DS_Store
 .vscode/
diff --git a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..b0c328faa639b3a3a210a3f03c23f429954c4719
GIT binary patch
literal 3117
zcmcgveP|o!89zCeD;sNd8lNI3&SGn|YC6T1CCjr?v&*Z~>2&JpPClJJ!#H%($(DPP
zRau`&-R7TlY4<@(O8*FqZH$d{4lFBW9c)WsO9Cqt3Wc$3>&PElX=z8#Z0-I@*FAT#
z<2X(?!k{nsJ-yHKyXW~mU$5jC5@zsT{Kh`~-F+;+fMX9}m}b&Gyj=K||B1Ny!~@sA
zdgU$lW8&#^uZgb@enk9k^zHY2-}(np`_uB7pS-h49C_z``dW39c=(~|mwxEqBz`bK
z-aO~sB&x4!*K5<8#M#+vf!F@25zp3cUUa{!5$nfKnzgqzqEGqb%U6G|5hq{qeX{S0
zM*QsJLpLqo(1`E+?U}0+OB(U#pFa583vrE54_|9?A&sz{yL95C1&y$MFYEfHUn4&G
z(V6k_IgR-H2KL1((;A^PUwFX^&!?tNecJxAMx1}|%^$z&)QH&wXBVGzXhiJ8{3UO{
zMl`O!ML2sk;_~!`qkq{4xWAr#DqzwG9K+}F1s7ys2=BxDM(vjE4A?E(7#lWu7b%8e
zqiPTA3np{_*je<ffAXcP=TEy|XEuI4|Na|~V9smiQG8&;hxHAO*au4G>T0c?tK=2D
z$F^X5*=)OPc0SkZwA_o!5f{`9!+S8;hTwr=aK1Q*f8VlQ!x#H*41ze0Vp3d0g2-^R
z9nLAlvT-R$#YusQi44z46c+^DVbDbp!-Xj%N};0|cJ#<m3(0fbmoS&E3-e+MiIfn@
zVUq{pEDDH=BhuU&6f-f(`UG~wbYYl+7lPsBli1`J01`x*xFqscuiFZzn83#<LCi>|
zM@=8N02mh;F+p);#`M8ph~;_io7jl~00t5a3lw-Eiep&+af^^3&F3c(O-TX71&1(f
z;=qs;N>Hq|jU6zxCxIC+#S^g@o3S2z5V{wmS(K%M=5JeVray>WA{w9s37m6Mh!Ln5
zi%2SN9iJE;;zh(t{TqWwppf<OI5)^Lba?z15zaJ!2+Xj23X($!bVkA-#;A!m!Zwgn
zX_4aMdg4ev!HL#GrgOn4MTQZ_kYFxJaU#Pa^S}BZF=d8R3>V~6oU_ODY5zuVm@<9W
z>N6*Y60FEb6J|^rKi|s>N0QjE>G}SRVJH(53R(SQqu?jRq^aP@Z^rn9$WmflB27Ox
zRgVQ+dV#2*B=CGRZo0Am;3x_(A_}ln{~>f?IL@<3keIWkxp^q66k-v*Rp2}Odk&jC
z)TMW~cCZiZlmW>6kN$po|0q7V)!+LisZ=Q_ZAn#{iYmD!$!PO`TWufrJKwzD`P_Y@
z_u%?&eLk|_!|Z))`DLY?!+SA&V4~;tslxC<5bUKffw6jCZ3`o#Bd*V7B%PmUUTl{R
z*cO}?+bj1EW7vqx-dD|43b{4=*LzPpx&_Ivs*Q4G&2fC}q+?H!9A9&dIl72${S*Uu
z+EK}s6?mo6P*p$_ONv^McFVesq<W{S109V@DZg4!9DuGg%GJ&-#j3ob<m<Y;tkiRb
zTs;Ty(+;Hq@KR+-e>Wk9j?FvP*LTS5X5rYmJas#}*xaGJgH=&o&DG~-_p-W8aT_f?
z+;tbQ+a2m|hkKI<obK>@Z`m;18o}3;Tz$1>BuFjEwOp;E=cn4q)oQCPJ&LAMTPv+r
zbm@$D%01<F0N-uzw=1W7Q(nh9W`}D|;8VxDb7MbanKsrEi4a<ZKYcxl&@2ipp%6k#
zh+Hycfxs?(OBYhN^ud<?KAd40Rt!CZPz%9(@HC7h^aq?_B>*4z3R6}aSvonJ_Ha!(
zoUF^0Xo)WcWJdKDb1C0r+4OQzrjwp*s$GnR{6$7}uQ6m~MfS93<dkn4qvkuz922U8
zJ>RmNO6WEkg;aZuB~d9`R#ll6igGzr&&B~q`(|XirQ2y`z(*#{6$@dtW$4Jrj9ga1
zzMwLt))Ls8k<$TR+AH|-v~CxbWKZPrY>IPt^)qUL4uK6}nORv~QaTY;E_2N+4RPt5
z$Vf6tW$HW`@fVZ3xS!JTDeqe87B4bV1z*j4MOX$KTmG|Ei2bp&XFD!b%B5ynvNz%e
z?km9gv*IYrw12q}%xv-88%MU>Zf4Sp9*#6@)UyynmJYcyw{q0>v!q_5ZQuVTr(~p<
z_H5U(trp73we+b#G2Qu0e*agV<w|l*4$l<y`ewWl74qNAmmuF^$S19q^5sxR(c^#`
zE2&T?U-o;AdR4M&zQR>y&#Y0)s2FWT$=McDF>15Z59MSh?@^nwZj)r%>`I~$?L_=~
z=fYed6b?X*gyp>6=K2{X1(wnOofr(1E?-O-lK{h{AB<h!DE%Y!^4<Dd*TZ~(9Rt6J
z0*eR+g+?fkf;&q66F{N{DAGL`yS^FoN~`re?e9QuHZq=sp#$D9hnX&aU83`bh1tid
ox!U6mrG6}5tsXNjma(miW2}3njJV(%Q~%Oz+y_74;2(wm0Hb2GPXGV_

literal 0
HcmV?d00001

diff --git a/modules/ml-pipeline/src/pipeline/build_model.py b/modules/ml-pipeline/src/pipeline/build_model.py
index a07e9cf..9f88dbd 100644
--- a/modules/ml-pipeline/src/pipeline/build_model.py
+++ b/modules/ml-pipeline/src/pipeline/build_model.py
@@ -68,13 +68,13 @@ def build_model(
         data=train_data, target=target, model_hyperparameters=model_hyperparameters
     )
 
-    logger.info("------------------------------")
-    logger.info("--- Generating predictions ---")
-    logger.info("------------------------------")
+    logger.info("----------------------------------")
+    logger.info("--- Generating fit predictions ---")
+    logger.info("----------------------------------")
 
     prediction_data = train_data.drop(columns=target)
 
-    predictions = model.predict(data=prediction_data)
+    fit_predictions = model.predict(data=prediction_data)
 
     logger.info("------------------------------")
     logger.info("--- Generating fit metrics ---")
@@ -82,7 +82,7 @@ def build_model(
 
     metrics_output = metrics.generate_metrics(
         target=train_data[target],
-        predictions=pd.Series(predictions),
+        predictions=pd.Series(fit_predictions),
     )
 
     logger.info("--------------------")
diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
index 8de60ea..75ae2be 100644
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@@ -1,5 +1,5 @@
-model_type: SKLearnLinearRegression
-model_save_filepath: ./data/model/model.joblib
+model_type: AutogluonAutoML
+model_save_filepath: ./data/model/autogluonmodel/
 fit_metrics_filepath: ./metrics/fit_metrics.json
 
 SKLearnLinearRegression: null
@@ -12,5 +12,5 @@ AutogluonAutoML:
   problem_type: regression
   eval_metric: mean_absolute_error
   time_limit: 400
-  presets: high_quality
+  presets: good_quality
   excluded_model_types: ['KNN']
diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
index 03c142d..ac75080 100644
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml
@@ -2,7 +2,59 @@ feature_processor_type: dataframe
 feature_processor_config:
   subsample_amount: null
   subsample_seed: 0
-  target: RDSAP_CHANGE
-  drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"]
-  retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
+  target: SAP_ENDING
+  drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"]
+  # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
   # retain_features: null
+#   retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
+#  'NUMBER_HEATED_ROOMS',
+#  'FIXED_LIGHTING_OUTLETS_COUNT',
+#  'CONSTRUCTION_AGE_BAND',
+#  'TRANSACTION_TYPE_STARTING',
+#  'LIGHTING_DESCRIPTION_STARTING',
+#  'MAINHEAT_DESCRIPTION_STARTING',
+#  'HOTWATER_DESCRIPTION_STARTING',
+#  'MAIN_FUEL_STARTING',
+#  'MECHANICAL_VENTILATION_STARTING',
+#  'SECONDHEAT_DESCRIPTION_STARTING',
+#  'ENERGY_TARIFF_STARTING',
+#  'SOLAR_WATER_HEATING_FLAG_STARTING',
+#  'PHOTO_SUPPLY_STARTING',
+#  'WINDOWS_DESCRIPTION_STARTING',
+#  'GLAZED_TYPE_STARTING',
+#  'MULTI_GLAZE_PROPORTION_STARTING',
+#  'LOW_ENERGY_LIGHTING_STARTING',
+#  'NUMBER_OPEN_FIREPLACES_STARTING',
+#  'MAINHEATCONT_DESCRIPTION_STARTING',
+#  'EXTENSION_COUNT_STARTING',
+#  'TOTAL_FLOOR_AREA_STARTING',
+#  'FLOOR_HEIGHT_STARTING',
+#  'DAYS_TO_STARTING',
+# 'WALLS_DESCRIPTION_STARTING',
+# 'FLOOR_DESCRIPTION_STARTING']
+  retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
+    'NUMBER_HEATED_ROOMS',
+    'FIXED_LIGHTING_OUTLETS_COUNT',
+    'CONSTRUCTION_AGE_BAND',
+    'TRANSACTION_TYPE_ENDING',
+    'LIGHTING_DESCRIPTION_ENDING',
+    'MAINHEAT_DESCRIPTION_ENDING',
+    'HOTWATER_DESCRIPTION_ENDING',
+    'MAIN_FUEL_ENDING',
+    'MECHANICAL_VENTILATION_ENDING',
+    'SECONDHEAT_DESCRIPTION_ENDING',
+    'ENERGY_TARIFF_ENDING',
+    'SOLAR_WATER_HEATING_FLAG_ENDING',
+    'PHOTO_SUPPLY_ENDING',
+    'WINDOWS_DESCRIPTION_ENDING',
+    'GLAZED_TYPE_ENDING',
+    'MULTI_GLAZE_PROPORTION_ENDING',
+    'LOW_ENERGY_LIGHTING_ENDING',
+    'NUMBER_OPEN_FIREPLACES_ENDING',
+    'MAINHEATCONT_DESCRIPTION_ENDING',
+    'EXTENSION_COUNT_ENDING',
+    'TOTAL_FLOOR_AREA_ENDING',
+    'FLOOR_HEIGHT_ENDING',
+    'DAYS_TO_ENDING',
+    'WALLS_DESCRIPTION_ENDING',
+    'FLOOR_DESCRIPTION_ENDING']  # continuation lines must stay indented deeper than the key
diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
index 4a7d5e1..91a4815 100644
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@@ -10,4 +10,10 @@ business_logic = {}
 """
 New features dict + function
 """
-new_feature_funcs = {}
+
+
+def SAP_ENDING(df):  # new-feature func; SAP_ENDING is the configured target
+    return df["SAP_STARTING"] + df["RDSAP_CHANGE"]  # run before RDSAP_CHANGE is dropped
+
+
+new_feature_funcs = {"SAP_ENDING": SAP_ENDING}
diff --git a/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
new file mode 100644
index 0000000..de18ba8
--- /dev/null
+++ b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml
@@ -0,0 +1,8 @@
+dataclient_type: local
+feature_importance_filepath: ./analysis/feature_importance.parquet
+permutation_subsample_amount: 1000
+loss_fns: "mean_absolute_percentage_error"
+feature_importance_column: importance
+n_repeats: 5
+figwidth: 7
+figheight: 6
diff --git a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
index cf99d6a..b7a5670 100644
--- a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml
@@ -1,6 +1,5 @@
 input_dataclient_type: aws-s3
 output_dataclient_type: local
-datahandler_type: parquet
 data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
 train_proportion: 0.9
 output_train_filepath: ./data/prepared_data/train.parquet
diff --git a/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py
index 03ec4a9..c8c9a4e 100644
--- a/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py
+++ b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py
@@ -134,6 +134,8 @@ class DataFrameFeatureProcessor:
             subsample_amount=feature_processor_config["subsample_amount"],
             subsample_seed=feature_processor_config["subsample_seed"],
         )
+        df = self.apply_business_logic(df, business_logic=business_logic)
+        df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
         df = self.drop_unused_columns(
             df, drop_columns=feature_processor_config["drop_columns"]
         )
@@ -142,6 +144,4 @@ class DataFrameFeatureProcessor:
             retain_features=feature_processor_config["retain_features"],
             target=feature_processor_config["target"],
         )
-        df = self.apply_business_logic(df, business_logic=business_logic)
-        df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
         return df
diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock
index 540ad8c..b1567bf 100644
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@@ -5,8 +5,8 @@ stages:
     deps:
     - path: prepare_data.py
       hash: md5
-      md5: 7531a931a405650dc4e8b5d8c1fd3c66
-      size: 4959
+      md5: 934d774e67f38e440b621ce71152f5f6
+      size: 5031
     params:
       configs/prepare_data.yaml:
         output_test_filepath: ./data/prepared_data/test.parquet
@@ -15,20 +15,20 @@ stages:
     outs:
     - path: data/prepared_data/
       hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
+      size: 13429347
       nfiles: 2
   build_model:
     cmd: python build_model.py
     deps:
     - path: build_model.py
       hash: md5
-      md5: c07ce0b8fdaf337ddfb7115684932157
-      size: 5048
+      md5: f9fa2a66d908b42ae196ce6f0f782258
+      size: 5134
     - path: data/prepared_data
       hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
+      size: 13429347
       nfiles: 2
     params:
       configs/build_model.yaml:
@@ -37,42 +37,42 @@ stages:
           problem_type: regression
           eval_metric: mean_absolute_error
           time_limit: 400
-          presets: high_quality
+          presets: good_quality
           excluded_model_types:
           - KNN
         SKLearnLinearRegression:
         SKLearnSVMRegression:
           kernel: linear
         fit_metrics_filepath: ./metrics/fit_metrics.json
-        model_save_filepath: ./data/model/model.joblib
-        model_type: SKLearnLinearRegression
+        model_save_filepath: ./data/model/autogluonmodel/
+        model_type: AutogluonAutoML
     outs:
     - path: data/model/
       hash: md5
-      md5: 2ace0835c28543512982b69d383b3c49.dir
-      size: 1832
-      nfiles: 1
+      md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir
+      size: 118580145
+      nfiles: 71
     - path: metrics/fit_metrics.json
       hash: md5
-      md5: c8c5a40863e2ced7f5f5a844ba203d80
-      size: 180
+      md5: d4afc981e1e0783b79b02b0ba54638c4
+      size: 185
   generate_predictions:
     cmd: python generate_predictions.py
     deps:
     - path: data/model
       hash: md5
-      md5: 2ace0835c28543512982b69d383b3c49.dir
-      size: 1832
-      nfiles: 1
+      md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir
+      size: 118580145
+      nfiles: 71
     - path: data/prepared_data
       hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
+      size: 13429347
       nfiles: 2
     - path: generate_predictions.py
       hash: md5
-      md5: ab603e9a526a73f2fe17603e6fe6c0a4
-      size: 4261
+      md5: a25c4611ff467cdc1c921918112a30fe
+      size: 4311
     params:
       configs/generate_predictions.yaml:
         input_dataclient_type: local
@@ -83,26 +83,26 @@ stages:
     outs:
     - path: data/predictions/
       hash: md5
-      md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
-      size: 643838
+      md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir
+      size: 537020
       nfiles: 1
   generate_metrics:
     cmd: python generate_metrics.py
     deps:
     - path: data/predictions
       hash: md5
-      md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
-      size: 643838
+      md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir
+      size: 537020
       nfiles: 1
     - path: data/prepared_data
       hash: md5
-      md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
-      size: 13238511
+      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
+      size: 13429347
       nfiles: 2
     - path: generate_metrics.py
       hash: md5
-      md5: 78a9b9b25d0a7deaf44277f9afad5f98
-      size: 4139
+      md5: 8ce0b6b55e1688fca816985e0cf37f28
+      size: 4220
     params:
       configs/generate_metrics.yaml:
         dataclient_type: local
@@ -113,8 +113,8 @@ stages:
     outs:
     - path: metrics/metrics.json
       hash: md5
-      md5: f494881710a057f90f82c0bd3a40a41d
-      size: 183
+      md5: f75356e08ceabb102d5b23508e140f0a
+      size: 182
   startup_cleanup:
     cmd: python startup_cleanup.py
     deps:
diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py
new file mode 100644
index 0000000..1260d09
--- /dev/null
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@@ -0,0 +1,177 @@
+"""
+Exploratory data analysis (EDA) on the prepared training dataset
+"""
+# Look at response variable
+
+from matplotlib import pyplot as plt
+import pandas as pd
+
+train_df = pd.read_parquet("./data/prepared_data/train.parquet")
+target = "SAP_ENDING"
+
+train_df = train_df.head(10000)
+
+# train_df[target].plot(kind='hist')
+
+# Plot the target variable
+fig, ax = plt.subplots(figsize=(10, 7))
+ax.hist(train_df[target], bins=range(min(train_df[target]), max(train_df[target])))
+
+fig
+
+# Find correlation to the target (numeric columns)
+train_df.dtypes
+# All numerical
+
+train_df_corr = train_df.corr()
+
+train_df_corr.style.background_gradient(cmap="coolwarm")
+
+train_df_corr["EXTENSION_COUNT_ENDING"]
+
+# Check out some correlation plots between variables
+# sap starting - negative correlation
+
+train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
+
+# heat demand - light positive correlation
+train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
+    x=target, y="HEAT_DEMAND_STARTING", style="o"
+)
+
+# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
+
+# Load the autogluon model and check feature importance
+
+
+import os
+import yaml
+import pandas as pd
+from pathlib import Path
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.DataClient import dataclient_factory
+from core.MLModels import model_factory
+from core.Logger import logger
+
+
+RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
+
+client_path = Path(__file__).parent / "configs" / "client.yaml"
+client_params = yaml.safe_load(client_path.read_text())  # read_text() closes the file
+
+prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
+prepare_data_params = yaml.safe_load(prepare_data_path.read_text())
+
+build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
+build_model_params = yaml.safe_load(build_model_path.read_text())
+
+generate_predictions_path = (
+    Path(__file__).parent / "configs" / "generate_predictions.yaml"
+)
+generate_predictions_params = yaml.safe_load(generate_predictions_path.read_text())
+
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = yaml.safe_load(feature_process_path.read_text())
+
+model = model_factory(build_model_params["model_type"])
+model_filepath = build_model_params["model_save_filepath"]
+
+model.load_model(model_filepath)
+
+fi = model.model.feature_importance(train_df.reset_index(drop=True))
+
+pred = pd.read_parquet("./data/predictions/predictions.parquet")
+test_df = pd.read_parquet("./data/prepared_data/test.parquet")
+
+# test_df = test_df.head(1000)
+
+test_df["predictions"] = pred["predictions"]
+
+test_df.groupby("PROPERTY_TYPE").apply(
+    lambda x: (x.SAP_ENDING - x.predictions).abs().mean()
+)
+
+test_df.head()
+flat_df = test_df[test_df["PROPERTY_TYPE"] == "Flat"]
+
+flat_df["residual"] = abs(flat_df["predictions"] - flat_df[target])
+
+generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
+generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
+from core.MLMetrics import metrics_factory
+
+metrics = metrics_factory(generate_metrics_params["metrics_type"])
+
+metrics_output = metrics.generate_metrics(
+    target=flat_df[target],
+    predictions=pd.Series(flat_df["predictions"]),
+)
+
+# Use alibi to run permutation importance
+
+from alibi.explainers import PermutationImportance, plot_permutation_importance
+from sklearn.metrics import mean_absolute_percentage_error
+import numpy as np
+import pandas as pd
+
+test_df = pd.read_parquet("./data/prepared_data/test.parquet")
+test_df = test_df.head(1000)
+
+target = "SAP_ENDING"
+feature_names = test_df.columns.to_list()
+feature_names.remove(target)
+
+x = test_df[feature_names].to_numpy()
+y = test_df[target].to_numpy()
+
+
+def predict_fn(X: np.ndarray) -> np.ndarray:
+    return model.predict(pd.DataFrame(X, columns=feature_names))
+
+
+pfi = PermutationImportance(
+    predictor=predict_fn,
+    loss_fns=mean_absolute_percentage_error,
+    feature_names=feature_names,
+    verbose=True,
+)
+
+exp = pfi.explain(x, y)
+plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
+
+[
+    "PROPERTY_TYPE",
+    "BUILT_FORM",
+    "CONSTITUENCY",
+    "NUMBER_HABITABLE_ROOMS",
+    "NUMBER_HEATED_ROOMS",
+    "FIXED_LIGHTING_OUTLETS_COUNT",
+    "CONSTRUCTION_AGE_BAND",
+    "TRANSACTION_TYPE_STARTING",
+    "LIGHTING_DESCRIPTION_STARTING",
+    "MAINHEAT_DESCRIPTION_STARTING",
+    "HOTWATER_DESCRIPTION_STARTING",
+    "MAIN_FUEL_STARTING",
+    "MECHANICAL_VENTILATION_STARTING",
+    "SECONDHEAT_DESCRIPTION_STARTING",
+    "ENERGY_TARIFF_STARTING",
+    "SOLAR_WATER_HEATING_FLAG_STARTING",
+    "PHOTO_SUPPLY_STARTING",
+    "WINDOWS_DESCRIPTION_STARTING",
+    "GLAZED_TYPE_STARTING",
+    "MULTI_GLAZE_PROPORTION_STARTING",
+    "LOW_ENERGY_LIGHTING_STARTING",
+    "NUMBER_OPEN_FIREPLACES_STARTING",
+    "MAINHEATCONT_DESCRIPTION_STARTING",
+    "EXTENSION_COUNT_STARTING",
+    "TOTAL_FLOOR_AREA_STARTING",
+    "FLOOR_HEIGHT_STARTING",
+    "DAYS_TO_STARTING",
+    "WALLS_DESCRIPTION_STARTING",
+    "FLOOR_DESCRIPTION_STARTING",
+]
+
+# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
+#
+#
diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py
new file mode 100644
index 0000000..206eb9a
--- /dev/null
+++ b/modules/ml-pipeline/src/pipeline/model_analysis.py
@@ -0,0 +1,150 @@
+"""
+Post Model generation step:
+We want to look at feature analysis of the model
+"""
+
+import yaml
+from pathlib import Path
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.Logger import logger
+from core.MLModels import model_factory
+from core.DataClient import dataclient_factory
+from alibi.explainers import PermutationImportance, plot_permutation_importance
+import numpy as np
+import pandas as pd
+
+
+client_path = Path(__file__).parent / "configs" / "client.yaml"
+client_params = yaml.safe_load(client_path.read_text())  # read_text() closes the file
+# prepare_data.yaml: provides output_test_filepath, read further down.
+prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
+prepare_data_params = yaml.safe_load(prepare_data_path.read_text())
+# feature_processor.yaml: provides the target column name.
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = yaml.safe_load(feature_process_path.read_text())
+# build_model.yaml: provides model_type and model_save_filepath.
+build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
+build_model_params = yaml.safe_load(build_model_path.read_text())
+# model_analysis.yaml: provides the settings for this analysis step.
+model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
+model_analysis_params = yaml.safe_load(model_analysis_path.read_text())
+# generate_predictions.yaml: loaded but not referenced again in this script.
+generate_predictions_path = (
+    Path(__file__).parent / "configs" / "generate_predictions.yaml"
+)
+generate_predictions_params = yaml.safe_load(generate_predictions_path.read_text())
+
+model = model_factory(build_model_params["model_type"])
+model.load_model(build_model_params["model_save_filepath"])
+
+dataclient_type = model_analysis_params["dataclient_type"]
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)
+
+
+feature_importance_filepath = model_analysis_params["feature_importance_filepath"]
+permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"]
+loss_fns = model_analysis_params["loss_fns"]
+feature_importance_column = model_analysis_params["feature_importance_column"]
+n_repeats = model_analysis_params["n_repeats"]
+figwidth = model_analysis_params["figwidth"]
+figheight = model_analysis_params["figheight"]
+target = feature_process_params["feature_processor_config"]["target"]
+output_test_filepath = prepare_data_params["output_test_filepath"]
+
+
+def model_analysis(
+    model: MLModel,
+    dataclient: DataClient,
+    target: str,
+    output_test_filepath: str,
+    feature_importance_filepath: str,
+    permutation_subsample_amount: int = 100,
+    loss_fns: str = "mean_absolute_percentage_error",
+    feature_importance_column: str = "importance",
+    n_repeats: int = 5,
+    figwidth: int = 7,
+    figheight: int = 6,
+):
+    """
+    Compute permutation feature importance for `model` and save it as a table.
+    Uses the first `permutation_subsample_amount` rows of the test parquet; the mean
+    importance per feature (loss `loss_fns`, `n_repeats` shuffles) is stored in column
+    `feature_importance_column`, sorted descending, and written with `dataclient`.
+    """
+    logger.info("------------------------------------")
+    logger.info(f"--- Generate Feature Importance ---")
+    logger.info("------------------------------------")
+
+    test_df = pd.read_parquet(output_test_filepath)
+    # Deterministic subsample: the first N rows of the file, not a random draw.
+    test_df = test_df.head(permutation_subsample_amount)
+
+    feature_names = test_df.columns.to_list()
+    feature_names.remove(target)
+    # Arrays for the explainer; predict_fn below re-attaches the column names.
+    x = test_df[feature_names].to_numpy()
+    y = test_df[target].to_numpy()
+
+    def predict_fn(X: np.ndarray) -> np.ndarray:
+        return model.predict(pd.DataFrame(X, columns=feature_names))
+
+    pfi = PermutationImportance(
+        predictor=predict_fn,
+        loss_fns=loss_fns,
+        feature_names=feature_names,
+        verbose=True,
+    )
+
+    logger.info(
+        f"Permutation feature importance - using {permutation_subsample_amount} samples and {n_repeats} shuffles per feature:"
+    )
+
+    exp = pfi.explain(x, y, n_repeats=n_repeats)
+    # Keep each feature's "mean"; [0] is presumably the only loss fn - TODO confirm.
+    mean_value_feature_importance = [
+        element["mean"] for element in exp.data["feature_importance"][0]
+    ]
+    feature_importance_df = pd.DataFrame(
+        mean_value_feature_importance,
+        index=exp.data["feature_names"],
+        columns=[feature_importance_column],
+    ).sort_values(feature_importance_column, ascending=False)
+    # NOTE(review): plot result is discarded; nothing here saves the figure.
+    plot_permutation_importance(
+        exp, fig_kw={"figwidth": figwidth, "figheight": figheight}
+    )
+
+    logger.info("--------------------------------------")
+    logger.info(f"--- Save Feature Importance table ---")
+    logger.info("--------------------------------------")
+
+    dataclient.save_data(feature_importance_df, location=feature_importance_filepath)
+
+
+if __name__ == "__main__":
+
+    logger.info("----------------------------")
+    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")
+
+    model_analysis(
+        model=model,
+        dataclient=dataclient,
+        target=target,
+        output_test_filepath=output_test_filepath,
+        feature_importance_filepath=feature_importance_filepath,
+        permutation_subsample_amount=permutation_subsample_amount,
+        loss_fns=loss_fns,
+        feature_importance_column=feature_importance_column,
+        n_repeats=n_repeats,
+        figwidth=figwidth,
+        figheight=figheight,
+    )
+
+    logger.info("-------------------------------")
+    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
new file mode 100644
index 0000000..428bf0b
--- /dev/null
+++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
@@ -0,0 +1,4 @@
+"""
+Look at why the model made such a prediction
+Manual script to run
+"""
diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
index b4679d0..e34d5af 100644
--- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt
@@ -2,6 +2,7 @@ joblib==1.3.2
 boto3==1.28.17
 pandas==1.5.3
 autogluon==0.8.2
+alibi==0.9.4
 pyarrow==13.0.0
 pre-commit==3.3.3
 sphinx==7.2.5

From 4a6b7f3ed770094704f2f58bd0bd5a00b7720f83 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Thu, 21 Sep 2023 21:28:14 +0000
Subject: [PATCH 2/3] fixed bug

---
 .pre-commit-config.yaml                       |  2 +-
 modules/ml-pipeline/src/pipeline/dvc.lock     | 38 ++++++------
 .../src/pipeline/prediction_analysis.py       | 61 +++++++++++++++++++
 3 files changed, 81 insertions(+), 20 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d59b9e8..196008f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,6 +14,6 @@ repos:
     hooks:
     -   id: dvc-push-experiment
         name: DVC - Push to experiment to remote location (experiments)
-        entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"'
+        entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"'
         language: system
         verbose: true
diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock
index b1567bf..501dc10 100644
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@@ -15,8 +15,8 @@ stages:
     outs:
     - path: data/prepared_data/
       hash: md5
-      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
-      size: 13429347
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
       nfiles: 2
   build_model:
     cmd: python build_model.py
@@ -27,8 +27,8 @@ stages:
       size: 5134
     - path: data/prepared_data
       hash: md5
-      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
-      size: 13429347
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
       nfiles: 2
     params:
       configs/build_model.yaml:
@@ -49,25 +49,25 @@ stages:
     outs:
     - path: data/model/
       hash: md5
-      md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir
-      size: 118580145
+      md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
+      size: 118227750
       nfiles: 71
     - path: metrics/fit_metrics.json
       hash: md5
-      md5: d4afc981e1e0783b79b02b0ba54638c4
+      md5: e1c9a16617804f48e8ffac7cec6575ca
       size: 185
   generate_predictions:
     cmd: python generate_predictions.py
     deps:
     - path: data/model
       hash: md5
-      md5: 10c467d6fe4ef8151d2df1e10fdf674f.dir
-      size: 118580145
+      md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
+      size: 118227750
       nfiles: 71
     - path: data/prepared_data
       hash: md5
-      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
-      size: 13429347
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
       nfiles: 2
     - path: generate_predictions.py
       hash: md5
@@ -83,21 +83,21 @@ stages:
     outs:
     - path: data/predictions/
       hash: md5
-      md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir
-      size: 537020
+      md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
+      size: 536774
       nfiles: 1
   generate_metrics:
     cmd: python generate_metrics.py
     deps:
     - path: data/predictions
       hash: md5
-      md5: 4acd58ff6aae8faedc0b0bb848aedc97.dir
-      size: 537020
+      md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
+      size: 536774
       nfiles: 1
     - path: data/prepared_data
       hash: md5
-      md5: ba1f48d2a8ebb63c8b2406632cc516a0.dir
-      size: 13429347
+      md5: 3767eec56906f5ac724a3f07433645ef.dir
+      size: 13442342
       nfiles: 2
     - path: generate_metrics.py
       hash: md5
@@ -113,8 +113,8 @@ stages:
     outs:
     - path: metrics/metrics.json
       hash: md5
-      md5: f75356e08ceabb102d5b23508e140f0a
-      size: 182
+      md5: 852ef4cf2ca5e7f89d70420a9df7a596
+      size: 183
   startup_cleanup:
     cmd: python startup_cleanup.py
     deps:
diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
index 428bf0b..344b602 100644
--- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
@@ -1,4 +1,65 @@
 """
 Look at why the model made such a prediction
 Manual script to run
+Workflow:
+- Identify the prediction row(s) that you wish to look into
+    - e.g. bad prediction(s)
+- Add these rows to the config
+- Run script
 """
+
+import shap
+
+shap.initjs()
+
+
+import yaml
+from pathlib import Path
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.Logger import logger
+from core.MLModels import model_factory
+from core.DataClient import dataclient_factory
+import numpy as np
+import pandas as pd
+
+
+client_path = Path(__file__).parent / "configs" / "client.yaml"
+client_params = yaml.safe_load(open(client_path))
+
+prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
+prepare_data_params = yaml.safe_load(open(prepare_data_path))
+
+feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
+feature_process_params = yaml.safe_load(open(feature_process_path))
+
+build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
+build_model_params = yaml.safe_load(open(build_model_path))
+
+model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
+model_analysis_params = yaml.safe_load(open(model_analysis_path))
+
+generate_predictions_path = (
+    Path(__file__).parent / "configs" / "generate_predictions.yaml"
+)
+generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
+
+model = model_factory(build_model_params["model_type"])
+model.load_model(build_model_params["model_save_filepath"])
+
+dataclient_type = model_analysis_params["dataclient_type"]
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)
+
+
+def prediction_analysis(model: MLModel, dataclient: DataClient):
+
+    shap.kmeans()
+
+    ...
+
+
+if __name__ == "__main__":
+    prediction_analysis()

From 1e70a3a582c2ebf3302df261a616aa636ddf3d57 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Thu, 21 Sep 2023 21:58:55 +0000
Subject: [PATCH 3/3] nearly there

---
 .../pipeline/configs/generate_metrics.yaml    |  2 -
 .../src/pipeline/model_analysis.py            |  2 +-
 .../src/pipeline/prediction_analysis.py       | 66 ++++++++++++++++---
 .../ml-pipeline/src/pipeline/prepare_data.py  |  3 +
 4 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
index 84f5897..7ed9819 100644
--- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml
@@ -1,5 +1,3 @@
 dataclient_type: local
-input_datahandler_type: parquet
-output_datahandler_type: json
 metrics_type: Regression
 metrics_output_filepath: ./metrics/metrics.json
diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py
index 206eb9a..fb1f23c 100644
--- a/modules/ml-pipeline/src/pipeline/model_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/model_analysis.py
@@ -79,7 +79,7 @@ def model_analysis(
     logger.info(f"--- Generate Feature Importance ---")
     logger.info("------------------------------------")
 
-    test_df = pd.read_parquet(output_test_filepath)
+    test_df = dataclient.load_data(output_test_filepath)
 
     test_df = test_df.head(permutation_subsample_amount)
 
diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
index 344b602..c65684f 100644
--- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py
+++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py
@@ -36,30 +36,76 @@ feature_process_params = yaml.safe_load(open(feature_process_path))
 build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
 build_model_params = yaml.safe_load(open(build_model_path))
 
-model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
-model_analysis_params = yaml.safe_load(open(model_analysis_path))
-
-generate_predictions_path = (
-    Path(__file__).parent / "configs" / "generate_predictions.yaml"
+prediction_analysis_path = (
+    Path(__file__).parent / "configs" / "prediction_analysis.yaml"
 )
-generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
+prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path))
 
 model = model_factory(build_model_params["model_type"])
 model.load_model(build_model_params["model_save_filepath"])
 
-dataclient_type = model_analysis_params["dataclient_type"]
+dataclient_type = prediction_analysis_params["dataclient_type"]
 dataclient = dataclient_factory(
     dataclient_type=dataclient_type,
     dataclient_config=client_params[dataclient_type],
 )
 
+output_test_filepath = prepare_data_params["output_test_filepath"]
 
-def prediction_analysis(model: MLModel, dataclient: DataClient):
 
-    shap.kmeans()
+def prediction_analysis(
+    model: MLModel, dataclient: DataClient, output_test_filepath: str
+):
 
+    test_df = dataclient.load_data(output_test_filepath)
+    target = "SAP_ENDING"
+    test_df_without_target = test_df.drop(columns=[target])
+
+    # test_df_summary = shap.kmeans(test_df, 10)
+    # print("Baseline feature-values: \n", test_df_summary)
+    class AutogluonWrapper:
+        def __init__(self, predictor, feature_names):
+            self.ag_model = predictor
+            self.feature_names = feature_names
+
+        def predict(self, X):
+            if isinstance(X, pd.Series):
+                X = X.values.reshape(1, -1)
+            if not isinstance(X, pd.DataFrame):
+                X = pd.DataFrame(X, columns=self.feature_names)
+            return self.ag_model.predict(X)
+
+    ag_wrapper = AutogluonWrapper(
+        model.model, feature_names=test_df_without_target.columns
+    )
+    explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target)
+
+    NSHAP_SAMPLES = 100  # how many samples to use to approximate each Shapley value; larger values will be slower
+    N_VAL = 30  # how many datapoints from the validation data to interpret predictions for; larger values will be slower
+
+    ROW_INDEX = 0  # index of an example datapoint
+    single_datapoint = test_df_without_target.iloc[[ROW_INDEX]]
+    single_prediction = ag_wrapper.predict(single_datapoint)
+
+    shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES)
+    shap.force_plot(
+        explainer.expected_value,
+        shap_values_single,
+        test_df_without_target.iloc[ROW_INDEX, :],
+    )
     ...
 
 
 if __name__ == "__main__":
-    prediction_analysis()
+
+    logger.info("----------------------------")
+    logger.info(f"--- {__file__} - Start! ---")
+    logger.info("----------------------------")
+
+    prediction_analysis(
+        model=model, dataclient=dataclient, output_test_filepath=output_test_filepath
+    )
+
+    logger.info("-------------------------------")
+    logger.info(f"--- {__file__} - Complete! ---")
+    logger.info("-------------------------------")
diff --git a/modules/ml-pipeline/src/pipeline/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py
index 8caa101..f7bdbd1 100644
--- a/modules/ml-pipeline/src/pipeline/prepare_data.py
+++ b/modules/ml-pipeline/src/pipeline/prepare_data.py
@@ -74,6 +74,9 @@ def prepare_data(
         train, test = train_test_split(
             data, train_size=train_proportion, test_size=(1 - train_proportion)
         )
+        test = test.reset_index(drop=True)
+
+    train = train.reset_index(drop=True)
 
     logger.info("-----------------------")
     logger.info("--- Outputting data ---")