diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d59b9e8..196008f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,6 @@ repos: hooks: - id: dvc-push-experiment name: DVC - Push to experiment to remote location (experiments) - entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"' + entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"' language: system verbose: true diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore index 63900db..664bc8d 100644 --- a/modules/ml-pipeline/.gitignore +++ b/modules/ml-pipeline/.gitignore @@ -1,4 +1,5 @@ .dev_env/ +.dev_env_pipeline/ __pycache__/ .DS_Store .vscode/ diff --git a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet new file mode 100644 index 0000000..b0c328f Binary files /dev/null and b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet differ diff --git a/modules/ml-pipeline/src/pipeline/build_model.py b/modules/ml-pipeline/src/pipeline/build_model.py index a07e9cf..9f88dbd 100644 --- a/modules/ml-pipeline/src/pipeline/build_model.py +++ b/modules/ml-pipeline/src/pipeline/build_model.py @@ -68,13 +68,13 @@ def build_model( data=train_data, target=target, model_hyperparameters=model_hyperparameters ) - logger.info("------------------------------") - logger.info("--- Generating predictions ---") - logger.info("------------------------------") + logger.info("----------------------------------") + logger.info("--- Generating fit predictions ---") + logger.info("----------------------------------") prediction_data = train_data.drop(columns=target) - predictions = model.predict(data=prediction_data) + fit_predictions = model.predict(data=prediction_data) logger.info("------------------------------") logger.info("--- Generating fit metrics ---") @@ -82,7 +82,7 @@ def build_model( metrics_output = metrics.generate_metrics( target=train_data[target], - predictions=pd.Series(predictions), + predictions=pd.Series(fit_predictions), ) logger.info("--------------------") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 8de60ea..75ae2be 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -1,5 +1,5 @@ -model_type: SKLearnLinearRegression -model_save_filepath: ./data/model/model.joblib +model_type: AutogluonAutoML +model_save_filepath: ./data/model/autogluonmodel/ fit_metrics_filepath: ./metrics/fit_metrics.json SKLearnLinearRegression: null @@ -12,5 +12,5 @@ AutogluonAutoML: problem_type: regression eval_metric: mean_absolute_error time_limit: 400 - presets: high_quality + presets: good_quality excluded_model_types: ['KNN'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml index 03c142d..ac75080 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml @@ -2,7 +2,59 @@ feature_processor_type: dataframe feature_processor_config: subsample_amount: null subsample_seed: 0 - target: RDSAP_CHANGE - drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"] - retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"] + target: SAP_ENDING + drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"] + # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"] # retain_features: null +# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS', +# 'NUMBER_HEATED_ROOMS', +# 'FIXED_LIGHTING_OUTLETS_COUNT', +# 'CONSTRUCTION_AGE_BAND', +# 'TRANSACTION_TYPE_STARTING', +# 'LIGHTING_DESCRIPTION_STARTING', +# 'MAINHEAT_DESCRIPTION_STARTING', +# 'HOTWATER_DESCRIPTION_STARTING', +# 'MAIN_FUEL_STARTING', +# 'MECHANICAL_VENTILATION_STARTING', +# 'SECONDHEAT_DESCRIPTION_STARTING', +# 'ENERGY_TARIFF_STARTING', +# 'SOLAR_WATER_HEATING_FLAG_STARTING', +# 'PHOTO_SUPPLY_STARTING', +# 'WINDOWS_DESCRIPTION_STARTING', +# 'GLAZED_TYPE_STARTING', +# 'MULTI_GLAZE_PROPORTION_STARTING', +# 'LOW_ENERGY_LIGHTING_STARTING', +# 'NUMBER_OPEN_FIREPLACES_STARTING', +# 'MAINHEATCONT_DESCRIPTION_STARTING', +# 'EXTENSION_COUNT_STARTING', +# 'TOTAL_FLOOR_AREA_STARTING', +# 'FLOOR_HEIGHT_STARTING', +# 'DAYS_TO_STARTING', +# 'WALLS_DESCRIPTION_STARTING', +# 'FLOOR_DESCRIPTION_STARTING'] + retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS', + 'NUMBER_HEATED_ROOMS', + 'FIXED_LIGHTING_OUTLETS_COUNT', + 'CONSTRUCTION_AGE_BAND', + 'TRANSACTION_TYPE_ENDING', + 'LIGHTING_DESCRIPTION_ENDING', + 'MAINHEAT_DESCRIPTION_ENDING', + 'HOTWATER_DESCRIPTION_ENDING', + 'MAIN_FUEL_ENDING', + 'MECHANICAL_VENTILATION_ENDING', + 'SECONDHEAT_DESCRIPTION_ENDING', + 'ENERGY_TARIFF_ENDING', + 'SOLAR_WATER_HEATING_FLAG_ENDING', + 'PHOTO_SUPPLY_ENDING', + 'WINDOWS_DESCRIPTION_ENDING', + 'GLAZED_TYPE_ENDING', + 'MULTI_GLAZE_PROPORTION_ENDING', + 'LOW_ENERGY_LIGHTING_ENDING', + 'NUMBER_OPEN_FIREPLACES_ENDING', + 'MAINHEATCONT_DESCRIPTION_ENDING', + 'EXTENSION_COUNT_ENDING', + 'TOTAL_FLOOR_AREA_ENDING', + 'FLOOR_HEIGHT_ENDING', + 'DAYS_TO_ENDING', +'WALLS_DESCRIPTION_ENDING', +'FLOOR_DESCRIPTION_ENDING'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 4a7d5e1..91a4815 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -10,4 +10,10 @@ business_logic = {} """ New features dict + function """ -new_feature_funcs = {} + + +def SAP_ENDING(df): + return df["SAP_STARTING"] + df["RDSAP_CHANGE"] + + +new_feature_funcs = {"SAP_ENDING": SAP_ENDING} diff --git a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml index 84f5897..7ed9819 100644 --- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml @@ -1,5 +1,3 @@ dataclient_type: local -input_datahandler_type: parquet -output_datahandler_type: json metrics_type: Regression metrics_output_filepath: ./metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml new file mode 100644 index 0000000..de18ba8 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml @@ -0,0 +1,8 @@ +dataclient_type: local +feature_importance_filepath: ./analysis/feature_importance.parquet +permutation_subsample_amount: 1000 +loss_fns: "mean_absolute_percentage_error" +feature_importance_column: importance +n_repeats: 5 +figwidth: 7 +figheight: 6 diff --git a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml index cf99d6a..b7a5670 100644 --- a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml @@ -1,6 +1,5 @@ input_dataclient_type: aws-s3 output_dataclient_type: local -datahandler_type: parquet data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet diff --git a/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py index 03ec4a9..c8c9a4e 100644 --- a/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py +++ b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py @@ -134,6 +134,8 @@ class DataFrameFeatureProcessor: subsample_amount=feature_processor_config["subsample_amount"], subsample_seed=feature_processor_config["subsample_seed"], ) + df = self.apply_business_logic(df, business_logic=business_logic) + df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs) df = self.drop_unused_columns( df, drop_columns=feature_processor_config["drop_columns"] ) @@ -142,6 +144,4 @@ class DataFrameFeatureProcessor: retain_features=feature_processor_config["retain_features"], target=feature_processor_config["target"], ) - df = self.apply_business_logic(df, business_logic=business_logic) - df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs) return df diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 540ad8c..501dc10 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: prepare_data.py hash: md5 - md5: 7531a931a405650dc4e8b5d8c1fd3c66 - size: 4959 + md5: 934d774e67f38e440b621ce71152f5f6 + size: 5031 params: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet @@ -15,20 +15,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir - size: 13238511 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 build_model: cmd: python build_model.py deps: - path: build_model.py hash: md5 - md5: c07ce0b8fdaf337ddfb7115684932157 - size: 5048 + md5: f9fa2a66d908b42ae196ce6f0f782258 + size: 5134 - path: data/prepared_data hash: md5 - md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir - size: 13238511 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 params: configs/build_model.yaml: @@ -37,42 +37,42 @@ stages: problem_type: regression eval_metric: mean_absolute_error time_limit: 400 - presets: high_quality + presets: good_quality excluded_model_types: - KNN SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear fit_metrics_filepath: ./metrics/fit_metrics.json - model_save_filepath: ./data/model/model.joblib - model_type: SKLearnLinearRegression + model_save_filepath: ./data/model/autogluonmodel/ + model_type: AutogluonAutoML outs: - path: data/model/ hash: md5 - md5: 2ace0835c28543512982b69d383b3c49.dir - size: 1832 - nfiles: 1 + md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir + size: 118227750 + nfiles: 71 - path: metrics/fit_metrics.json hash: md5 - md5: c8c5a40863e2ced7f5f5a844ba203d80 - size: 180 + md5: e1c9a16617804f48e8ffac7cec6575ca + size: 185 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 2ace0835c28543512982b69d383b3c49.dir - size: 1832 - nfiles: 1 + md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir + size: 118227750 + nfiles: 71 - path: data/prepared_data hash: md5 - md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir - size: 13238511 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 - path: generate_predictions.py hash: md5 - md5: ab603e9a526a73f2fe17603e6fe6c0a4 - size: 4261 + md5: a25c4611ff467cdc1c921918112a30fe + size: 4311 params: configs/generate_predictions.yaml: input_dataclient_type: local @@ -83,26 +83,26 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: e87d96ed77d01ab2f24aeab5aaafe344.dir - size: 643838 + md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir + size: 536774 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: e87d96ed77d01ab2f24aeab5aaafe344.dir - size: 643838 + md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir + size: 536774 nfiles: 1 - path: data/prepared_data hash: md5 - md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir - size: 13238511 + md5: 3767eec56906f5ac724a3f07433645ef.dir + size: 13442342 nfiles: 2 - path: generate_metrics.py hash: md5 - md5: 78a9b9b25d0a7deaf44277f9afad5f98 - size: 4139 + md5: 8ce0b6b55e1688fca816985e0cf37f28 + size: 4220 params: configs/generate_metrics.yaml: dataclient_type: local @@ -113,7 +113,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f494881710a057f90f82c0bd3a40a41d + md5: 852ef4cf2ca5e7f89d70420a9df7a596 size: 183 startup_cleanup: cmd: python startup_cleanup.py diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py new file mode 100644 index 0000000..1260d09 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -0,0 +1,177 @@ +""" +Doing some eda on dataset +""" +# Look at response variable + +from matplotlib import pyplot as plt +import pandas as pd + +train_df = pd.read_parquet("./data/prepared_data/train.parquet") +target = "SAP_ENDING" + +train_df = train_df.head(10000) + +# train_df[target].plot(kind='hist') + +# Plot the target variable +fig, ax = plt.subplots(figsize=(10, 7)) +ax.hist(train_df[target], bins=range(min(train_df[target]), max(train_df[target]))) + +fig + +# Find correlation to sale price (numeric) +train_df.dtypes +# All numerical + +train_df_corr = train_df.corr() + +train_df_corr.style.background_gradient(cmap="coolwarm") + +train_df_corr["EXTENSION_COUNT_ENDING"] + +# Check out some correlation plots between variables +# sap starting - negative correlation + +train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o") + +# head demand - light positive correlation +train_df[[target, "HEAT_DEMAND_STARTING"]].plot( + x=target, y="HEAT_DEMAND_STARTING", style="o" +) + +# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict + +# Load the autogluon model and check feature importance + + +import os +import yaml +import pandas as pd +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.DataClient import dataclient_factory +from core.MLModels import model_factory +from core.Logger import logger + + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") + +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +generate_predictions_path = ( + Path(__file__).parent / "configs" / "generate_predictions.yaml" +) +generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) + +feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" +feature_process_params = yaml.safe_load(open(feature_process_path)) + +model = model_factory(build_model_params["model_type"]) +model_filepath = build_model_params["model_save_filepath"] + +model.load_model(model_filepath) + +fi = model.model.feature_importance(train_df.reset_index(drop=True)) + +pred = pd.read_parquet("./data/predictions/predictions.parquet") +test_df = pd.read_parquet("./data/prepared_data/test.parquet") + +# test_df = test_df.head(1000) + +test_df["predictions"] = pred["predictions"] + +test_df.groupby("PROPERTY_TYPE").apply( + lambda x: (x.SAP_ENDING - x.predictions).abs().mean() +) + +test_df.head() +flat_df = test_df[test_df["PROPERTY_TYPE"] == "Flat"] + +flat_df["residual"] = abs(flat_df["predictions"] - flat_df[target]) + +generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml" +generate_metrics_params = yaml.safe_load(open(generate_metrics_path)) +from core.MLMetrics import metrics_factory + +metrics = metrics_factory(generate_metrics_params["metrics_type"]) + +metrics_output = metrics.generate_metrics( + target=flat_df[target], + predictions=pd.Series(flat_df["predictions"]), +) + +# Use alibi to run permutation importance + +from alibi.explainers import PermutationImportance, plot_permutation_importance +from sklearn.metrics import mean_absolute_percentage_error +import numpy as np +import pandas as pd + +test_df = pd.read_parquet("./data/prepared_data/test.parquet") +test_df = test_df.head(1000) + +target = "SAP_ENDING" +feature_names = test_df.columns.to_list() +feature_names.remove(target) + +x = test_df[feature_names].to_numpy() +y = test_df[target].to_numpy() + + +def predict_fn(X: np.ndarray) -> np.ndarray: + return model.predict(pd.DataFrame(X, columns=feature_names)) + + +pfi = PermutationImportance( + predictor=predict_fn, + loss_fns=mean_absolute_percentage_error, + feature_names=feature_names, + verbose=True, +) + +exp = pfi.explain(x, y) +plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6}) + +[ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTITUENCY", + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "CONSTRUCTION_AGE_BAND", + "TRANSACTION_TYPE_STARTING", + "LIGHTING_DESCRIPTION_STARTING", + "MAINHEAT_DESCRIPTION_STARTING", + "HOTWATER_DESCRIPTION_STARTING", + "MAIN_FUEL_STARTING", + "MECHANICAL_VENTILATION_STARTING", + "SECONDHEAT_DESCRIPTION_STARTING", + "ENERGY_TARIFF_STARTING", + "SOLAR_WATER_HEATING_FLAG_STARTING", + "PHOTO_SUPPLY_STARTING", + "WINDOWS_DESCRIPTION_STARTING", + "GLAZED_TYPE_STARTING", + "MULTI_GLAZE_PROPORTION_STARTING", + "LOW_ENERGY_LIGHTING_STARTING", + "NUMBER_OPEN_FIREPLACES_STARTING", + "MAINHEATCONT_DESCRIPTION_STARTING", + "EXTENSION_COUNT_STARTING", + "TOTAL_FLOOR_AREA_STARTING", + "FLOOR_HEIGHT_STARTING", + "DAYS_TO_STARTING", + "WALLS_DESCRIPTION_STARTING", + "FLOOR_DESCRIPTION_STARTING", +] + +# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96 +# +# diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py new file mode 100644 index 0000000..fb1f23c --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/model_analysis.py @@ -0,0 +1,150 @@ +""" +Post Model generation step: +We want to look at feature analysis of the model +""" + +import yaml +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger +from core.MLModels import model_factory +from core.DataClient import dataclient_factory +from alibi.explainers import PermutationImportance, plot_permutation_importance +import numpy as np +import pandas as pd + + +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" +feature_process_params = yaml.safe_load(open(feature_process_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml" +model_analysis_params = yaml.safe_load(open(model_analysis_path)) + +generate_predictions_path = ( + Path(__file__).parent / "configs" / "generate_predictions.yaml" +) +generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) + +model = model_factory(build_model_params["model_type"]) +model.load_model(build_model_params["model_save_filepath"]) + +dataclient_type = model_analysis_params["dataclient_type"] +dataclient = dataclient_factory( + dataclient_type=dataclient_type, + dataclient_config=client_params[dataclient_type], +) + + +feature_importance_filepath = model_analysis_params["feature_importance_filepath"] +permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"] +loss_fns = model_analysis_params["loss_fns"] +feature_importance_column = model_analysis_params["feature_importance_column"] +n_repeats = model_analysis_params["n_repeats"] +figwidth = model_analysis_params["figwidth"] +figheight = model_analysis_params["figheight"] +target = feature_process_params["feature_processor_config"]["target"] +output_test_filepath = prepare_data_params["output_test_filepath"] + + +def model_analysis( + model: MLModel, + dataclient: DataClient, + target: str, + output_test_filepath: str, + feature_importance_filepath: str, + permutation_subsample_amount: int = 100, + loss_fns: str = "mean_absolute_percentage_error", + feature_importance_column: str = "importance", + n_repeats: int = 5, + figwidth: int = 7, + figheight: int = 6, +): + """ + Key task is to take in a model and generate: + - feature importance + and save these outputs + """ + + logger.info("------------------------------------") + logger.info(f"--- Generate Feature Importance ---") + logger.info("------------------------------------") + + test_df = dataclient.load_data(output_test_filepath) + + test_df = test_df.head(permutation_subsample_amount) + + feature_names = test_df.columns.to_list() + feature_names.remove(target) + + x = test_df[feature_names].to_numpy() + y = test_df[target].to_numpy() + + def predict_fn(X: np.ndarray) -> np.ndarray: + return model.predict(pd.DataFrame(X, columns=feature_names)) + + pfi = PermutationImportance( + predictor=predict_fn, + loss_fns=loss_fns, + feature_names=feature_names, + verbose=True, + ) + + logger.info( + f"Permutation feature importance - using {permutation_subsample_amount} samples and {n_repeats} shuffles per feature:" + ) + + exp = pfi.explain(x, y, n_repeats=n_repeats) + + mean_value_feature_importance = [ + element["mean"] for element in exp.data["feature_importance"][0] + ] + feature_importance_df = pd.DataFrame( + mean_value_feature_importance, + index=exp.data["feature_names"], + columns=[feature_importance_column], + ).sort_values(feature_importance_column, ascending=False) + + plot_permutation_importance( + exp, fig_kw={"figwidth": figwidth, "figheight": figheight} + ) + + logger.info("--------------------------------------") + logger.info(f"--- Save Feature Importance table ---") + logger.info("--------------------------------------") + + dataclient.save_data(feature_importance_df, location=feature_importance_filepath) + + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + model_analysis( + model=model, + dataclient=dataclient, + target=target, + output_test_filepath=output_test_filepath, + feature_importance_filepath=feature_importance_filepath, + permutation_subsample_amount=permutation_subsample_amount, + loss_fns=loss_fns, + feature_importance_column=feature_importance_column, + n_repeats=n_repeats, + figwidth=figwidth, + figheight=figheight, + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py new file mode 100644 index 0000000..c65684f --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -0,0 +1,111 @@ +""" +Look at why the model made such a prediction +Manual script to run +Workflow: +- Identify a prediction row/s that you wish to look into + - i.e. a bad prediction/s +- Add these rows to the config +- Run script +""" + +import shap + +shap.initjs() + + +import yaml +from pathlib import Path +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger +from core.MLModels import model_factory +from core.DataClient import dataclient_factory +import numpy as np +import pandas as pd + + +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + +prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" +prepare_data_params = yaml.safe_load(open(prepare_data_path)) + +feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" +feature_process_params = yaml.safe_load(open(feature_process_path)) + +build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" +build_model_params = yaml.safe_load(open(build_model_path)) + +prediction_analysis_path = ( + Path(__file__).parent / "configs" / "prediction_analysis.yaml" +) +prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path)) + +model = model_factory(build_model_params["model_type"]) +model.load_model(build_model_params["model_save_filepath"]) + +dataclient_type = prediction_analysis_params["dataclient_type"] +dataclient = dataclient_factory( + dataclient_type=dataclient_type, + dataclient_config=client_params[dataclient_type], +) + +output_test_filepath = prepare_data_params["output_test_filepath"] + + +def prediction_analysis( + model: MLModel, dataclient: DataClient, output_test_filepath: str +): + + test_df = dataclient.load_data(output_test_filepath) + target = "SAP_ENDING" + test_df_without_target = test_df.drop(columns=[target]) + + # test_df_summary = shap.kmeans(test_df, 10) + # print("Baseline feature-values: \n", test_df_summary) + class AutogluonWrapper: + def __init__(self, predictor, feature_names): + self.ag_model = predictor + self.feature_names = feature_names + + def predict(self, X): + if isinstance(X, pd.Series): + X = X.values.reshape(1, -1) + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X, columns=self.feature_names) + return self.ag_model.predict(X) + + ag_wrapper = AutogluonWrapper( + model.model, feature_names=test_df_without_target.columns + ) + explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target) + + NSHAP_SAMPLES = 100 # how many samples to use to approximate each Shapely value, larger values will be slower + N_VAL = 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower + + ROW_INDEX = 0 # index of an example datapoint + single_datapoint = test_df_without_target.iloc[[ROW_INDEX]] + single_prediction = ag_wrapper.predict(single_datapoint) + + shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES) + shap.force_plot( + explainer.expected_value, + shap_values_single, + test_df_without_target.iloc[ROW_INDEX, :], + ) + ... + + +if __name__ == "__main__": + + logger.info("----------------------------") + logger.info(f"--- {__file__} - Start! ---") + logger.info("----------------------------") + + prediction_analysis( + model=model, dataclient=dataclient, output_test_filepath=output_test_filepath + ) + + logger.info("-------------------------------") + logger.info(f"--- {__file__} - Complete! ---") + logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py index 8caa101..f7bdbd1 100644 --- a/modules/ml-pipeline/src/pipeline/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/prepare_data.py @@ -74,6 +74,9 @@ def prepare_data( train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) ) + test = test.reset_index(drop=True) + + train = train.reset_index(drop=True) logger.info("-----------------------") logger.info("--- Outputting data ---") diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index b4679d0..e34d5af 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -2,6 +2,7 @@ joblib==1.3.2 boto3==1.28.17 pandas==1.5.3 autogluon==0.8.2 +alibi==0.9.4 pyarrow==13.0.0 pre-commit==3.3.3 sphinx==7.2.5