Merge pull request #37 from Hestia-Homes/model-test

Model test
This commit is contained in:
quandanrepo 2023-09-21 23:00:20 +01:00 committed by GitHub
commit 7729f96903
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 556 additions and 50 deletions

View file

@ -14,6 +14,6 @@ repos:
hooks:
- id: dvc-push-experiment
name: DVC - Push to experiment to remote location (experiments)
entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"'
entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"'
language: system
verbose: true

View file

@ -1,4 +1,5 @@
.dev_env/
.dev_env_pipeline/
__pycache__/
.DS_Store
.vscode/

View file

@ -68,13 +68,13 @@ def build_model(
data=train_data, target=target, model_hyperparameters=model_hyperparameters
)
logger.info("------------------------------")
logger.info("--- Generating predictions ---")
logger.info("------------------------------")
logger.info("----------------------------------")
logger.info("--- Generating fit predictions ---")
logger.info("----------------------------------")
prediction_data = train_data.drop(columns=target)
predictions = model.predict(data=prediction_data)
fit_predictions = model.predict(data=prediction_data)
logger.info("------------------------------")
logger.info("--- Generating fit metrics ---")
@ -82,7 +82,7 @@ def build_model(
metrics_output = metrics.generate_metrics(
target=train_data[target],
predictions=pd.Series(predictions),
predictions=pd.Series(fit_predictions),
)
logger.info("--------------------")

View file

@ -1,5 +1,5 @@
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/model.joblib
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression: null
@ -12,5 +12,5 @@ AutogluonAutoML:
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 400
presets: high_quality
presets: good_quality
excluded_model_types: ['KNN']

View file

@ -2,7 +2,59 @@ feature_processor_type: dataframe
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: RDSAP_CHANGE
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"]
retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"]
# retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
# retain_features: null
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
# 'NUMBER_HEATED_ROOMS',
# 'FIXED_LIGHTING_OUTLETS_COUNT',
# 'CONSTRUCTION_AGE_BAND',
# 'TRANSACTION_TYPE_STARTING',
# 'LIGHTING_DESCRIPTION_STARTING',
# 'MAINHEAT_DESCRIPTION_STARTING',
# 'HOTWATER_DESCRIPTION_STARTING',
# 'MAIN_FUEL_STARTING',
# 'MECHANICAL_VENTILATION_STARTING',
# 'SECONDHEAT_DESCRIPTION_STARTING',
# 'ENERGY_TARIFF_STARTING',
# 'SOLAR_WATER_HEATING_FLAG_STARTING',
# 'PHOTO_SUPPLY_STARTING',
# 'WINDOWS_DESCRIPTION_STARTING',
# 'GLAZED_TYPE_STARTING',
# 'MULTI_GLAZE_PROPORTION_STARTING',
# 'LOW_ENERGY_LIGHTING_STARTING',
# 'NUMBER_OPEN_FIREPLACES_STARTING',
# 'MAINHEATCONT_DESCRIPTION_STARTING',
# 'EXTENSION_COUNT_STARTING',
# 'TOTAL_FLOOR_AREA_STARTING',
# 'FLOOR_HEIGHT_STARTING',
# 'DAYS_TO_STARTING',
# 'WALLS_DESCRIPTION_STARTING',
# 'FLOOR_DESCRIPTION_STARTING']
retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
'NUMBER_HEATED_ROOMS',
'FIXED_LIGHTING_OUTLETS_COUNT',
'CONSTRUCTION_AGE_BAND',
'TRANSACTION_TYPE_ENDING',
'LIGHTING_DESCRIPTION_ENDING',
'MAINHEAT_DESCRIPTION_ENDING',
'HOTWATER_DESCRIPTION_ENDING',
'MAIN_FUEL_ENDING',
'MECHANICAL_VENTILATION_ENDING',
'SECONDHEAT_DESCRIPTION_ENDING',
'ENERGY_TARIFF_ENDING',
'SOLAR_WATER_HEATING_FLAG_ENDING',
'PHOTO_SUPPLY_ENDING',
'WINDOWS_DESCRIPTION_ENDING',
'GLAZED_TYPE_ENDING',
'MULTI_GLAZE_PROPORTION_ENDING',
'LOW_ENERGY_LIGHTING_ENDING',
'NUMBER_OPEN_FIREPLACES_ENDING',
'MAINHEATCONT_DESCRIPTION_ENDING',
'EXTENSION_COUNT_ENDING',
'TOTAL_FLOOR_AREA_ENDING',
'FLOOR_HEIGHT_ENDING',
'DAYS_TO_ENDING',
'WALLS_DESCRIPTION_ENDING',
'FLOOR_DESCRIPTION_ENDING']

View file

@ -10,4 +10,10 @@ business_logic = {}
"""
New features dict + function
"""
new_feature_funcs = {}
def SAP_ENDING(df):
    """
    Return the element-wise sum of the SAP_STARTING and RDSAP_CHANGE
    columns of `df`.
    """
    starting_score = df["SAP_STARTING"]
    score_change = df["RDSAP_CHANGE"]
    return starting_score + score_change
new_feature_funcs = {"SAP_ENDING": SAP_ENDING}

View file

@ -1,5 +1,3 @@
dataclient_type: local
input_datahandler_type: parquet
output_datahandler_type: json
metrics_type: Regression
metrics_output_filepath: ./metrics/metrics.json

View file

@ -0,0 +1,8 @@
dataclient_type: local
feature_importance_filepath: ./analysis/feature_importance.parquet
permutation_subsample_amount: 1000
loss_fns: "mean_absolute_percentage_error"
feature_importance_column: importance
n_repeats: 5
figwidth: 7
figheight: 6

View file

@ -1,6 +1,5 @@
input_dataclient_type: aws-s3
output_dataclient_type: local
datahandler_type: parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet

View file

@ -134,6 +134,8 @@ class DataFrameFeatureProcessor:
subsample_amount=feature_processor_config["subsample_amount"],
subsample_seed=feature_processor_config["subsample_seed"],
)
df = self.apply_business_logic(df, business_logic=business_logic)
df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
df = self.drop_unused_columns(
df, drop_columns=feature_processor_config["drop_columns"]
)
@ -142,6 +144,4 @@ class DataFrameFeatureProcessor:
retain_features=feature_processor_config["retain_features"],
target=feature_processor_config["target"],
)
df = self.apply_business_logic(df, business_logic=business_logic)
df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
return df

View file

@ -5,8 +5,8 @@ stages:
deps:
- path: prepare_data.py
hash: md5
md5: 7531a931a405650dc4e8b5d8c1fd3c66
size: 4959
md5: 934d774e67f38e440b621ce71152f5f6
size: 5031
params:
configs/prepare_data.yaml:
output_test_filepath: ./data/prepared_data/test.parquet
@ -15,20 +15,20 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: 3767eec56906f5ac724a3f07433645ef.dir
size: 13442342
nfiles: 2
build_model:
cmd: python build_model.py
deps:
- path: build_model.py
hash: md5
md5: c07ce0b8fdaf337ddfb7115684932157
size: 5048
md5: f9fa2a66d908b42ae196ce6f0f782258
size: 5134
- path: data/prepared_data
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: 3767eec56906f5ac724a3f07433645ef.dir
size: 13442342
nfiles: 2
params:
configs/build_model.yaml:
@ -37,42 +37,42 @@ stages:
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 400
presets: high_quality
presets: good_quality
excluded_model_types:
- KNN
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
fit_metrics_filepath: ./metrics/fit_metrics.json
model_save_filepath: ./data/model/model.joblib
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/autogluonmodel/
model_type: AutogluonAutoML
outs:
- path: data/model/
hash: md5
md5: 2ace0835c28543512982b69d383b3c49.dir
size: 1832
nfiles: 1
md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
size: 118227750
nfiles: 71
- path: metrics/fit_metrics.json
hash: md5
md5: c8c5a40863e2ced7f5f5a844ba203d80
size: 180
md5: e1c9a16617804f48e8ffac7cec6575ca
size: 185
generate_predictions:
cmd: python generate_predictions.py
deps:
- path: data/model
hash: md5
md5: 2ace0835c28543512982b69d383b3c49.dir
size: 1832
nfiles: 1
md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
size: 118227750
nfiles: 71
- path: data/prepared_data
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: 3767eec56906f5ac724a3f07433645ef.dir
size: 13442342
nfiles: 2
- path: generate_predictions.py
hash: md5
md5: ab603e9a526a73f2fe17603e6fe6c0a4
size: 4261
md5: a25c4611ff467cdc1c921918112a30fe
size: 4311
params:
configs/generate_predictions.yaml:
input_dataclient_type: local
@ -83,26 +83,26 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
size: 643838
md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
size: 536774
nfiles: 1
generate_metrics:
cmd: python generate_metrics.py
deps:
- path: data/predictions
hash: md5
md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
size: 643838
md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
size: 536774
nfiles: 1
- path: data/prepared_data
hash: md5
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
size: 13238511
md5: 3767eec56906f5ac724a3f07433645ef.dir
size: 13442342
nfiles: 2
- path: generate_metrics.py
hash: md5
md5: 78a9b9b25d0a7deaf44277f9afad5f98
size: 4139
md5: 8ce0b6b55e1688fca816985e0cf37f28
size: 4220
params:
configs/generate_metrics.yaml:
dataclient_type: local
@ -113,7 +113,7 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: f494881710a057f90f82c0bd3a40a41d
md5: 852ef4cf2ca5e7f89d70420a9df7a596
size: 183
startup_cleanup:
cmd: python startup_cleanup.py

View file

@ -0,0 +1,177 @@
"""
Doing some eda on dataset
"""
# Look at response variable
from matplotlib import pyplot as plt
import pandas as pd
import math

train_df = pd.read_parquet("./data/prepared_data/train.parquet")
target = "SAP_ENDING"
# Only the first 10,000 rows are explored.
train_df = train_df.head(10000)

# Plot the target variable as a histogram with unit-wide bins.
# range() only accepts integers and leaves out its stop value, so the bounds
# are floored and the stop is pushed one past the final bin edge; otherwise
# the rows holding the largest target value fall outside the last bin.
first_edge = math.floor(train_df[target].min())
last_edge = math.floor(train_df[target].max()) + 1
fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(train_df[target], bins=range(first_edge, last_edge + 1))
# Bare expressions such as `fig` do nothing when run as a plain script -
# presumably this file is stepped through interactively (TODO confirm).
fig

# Find the correlation between the numeric columns (including the target).
train_df.dtypes
# numeric_only=True states explicitly that non-numeric columns are left out
# of the correlation matrix instead of relying on pandas' implicit handling.
train_df_corr = train_df.corr(numeric_only=True)
train_df_corr.style.background_gradient(cmap="coolwarm")
train_df_corr["EXTENSION_COUNT_ENDING"]

# Check out some correlation plots between variables.
# NOTE(review): the observations below are the original author's notes -
# re-verify them against the current target column.
# sap starting - negative correlation
train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")

# heat demand - light positive correlation
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
    x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict, and the
# higher the heat demand, the higher we predict.
# Load the autogluon model and check feature importance
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.Logger import logger
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")


def _read_yaml(path):
    """
    Parse the YAML file at `path` and return the loaded object.
    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes.
    """
    with open(path) as config_file:
        return yaml.safe_load(config_file)


# Every config file is read from the `configs` directory next to this script.
_CONFIG_DIR = Path(__file__).parent / "configs"

client_path = _CONFIG_DIR / "client.yaml"
client_params = _read_yaml(client_path)

prepare_data_path = _CONFIG_DIR / "prepare_data.yaml"
prepare_data_params = _read_yaml(prepare_data_path)

build_model_path = _CONFIG_DIR / "build_model.yaml"
build_model_params = _read_yaml(build_model_path)

generate_predictions_path = _CONFIG_DIR / "generate_predictions.yaml"
generate_predictions_params = _read_yaml(generate_predictions_path)

feature_process_path = _CONFIG_DIR / "feature_processor.yaml"
feature_process_params = _read_yaml(feature_process_path)

# Build the model type named in the build_model config and load it from the
# path that config points at.
model = model_factory(build_model_params["model_type"])
model_filepath = build_model_params["model_save_filepath"]
model.load_model(model_filepath)

# Feature importance from the wrapped model object, computed on the training
# sample with a fresh 0..n-1 index.
fi = model.model.feature_importance(train_df.reset_index(drop=True))
pred = pd.read_parquet("./data/predictions/predictions.parquet")
test_df = pd.read_parquet("./data/prepared_data/test.parquet")
# Column assignment aligns on the index, so the predictions and the test set
# need to share the same index for the rows to line up.
test_df["predictions"] = pred["predictions"]

# Mean absolute error of the predictions within each property type.
test_df.groupby("PROPERTY_TYPE").apply(
    lambda group: (group[target] - group["predictions"]).abs().mean()
)
test_df.head()

# Take an explicit copy of the filtered rows: adding a column to a slice of
# test_df triggers pandas' SettingWithCopyWarning and may not stick.
flat_df = test_df[test_df["PROPERTY_TYPE"] == "Flat"].copy()
flat_df["residual"] = (flat_df["predictions"] - flat_df[target]).abs()

generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
# Open the config in a `with` block so the file handle is closed after parsing.
with open(generate_metrics_path) as generate_metrics_file:
    generate_metrics_params = yaml.safe_load(generate_metrics_file)

from core.MLMetrics import metrics_factory

# Metrics restricted to the "Flat" rows.
metrics = metrics_factory(generate_metrics_params["metrics_type"])
metrics_output = metrics.generate_metrics(
    target=flat_df[target],
    predictions=pd.Series(flat_df["predictions"]),
)
# Use alibi to run permutation importance
from alibi.explainers import PermutationImportance, plot_permutation_importance
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import pandas as pd
test_df = pd.read_parquet("./data/prepared_data/test.parquet")
# Only the first 1,000 test rows are used for the permutation runs.
test_df = test_df.head(1000)
target = "SAP_ENDING"

feature_names = test_df.columns.to_list()
feature_names.remove(target)
# Remember each feature's dtype: to_numpy() collapses a mixed-type frame into
# a single common dtype, so the per-column types are lost in the array that
# the explainer permutes and hands back to predict_fn.
feature_dtypes = test_df[feature_names].dtypes.to_dict()

x = test_df[feature_names].to_numpy()
y = test_df[target].to_numpy()


def predict_fn(X: np.ndarray) -> np.ndarray:
    """
    Rebuild a DataFrame with the original column names and dtypes from the
    raw array and return the model's predictions for it.
    """
    features = pd.DataFrame(X, columns=feature_names).astype(feature_dtypes)
    return model.predict(features)


pfi = PermutationImportance(
    predictor=predict_fn,
    loss_fns=mean_absolute_percentage_error,
    feature_names=feature_names,
    verbose=True,
)
exp = pfi.explain(x, y)
plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# NOTE(review): this list literal is a bare expression - it is never assigned
# or passed anywhere, so it has no effect when the script runs. It presumably
# records a *_STARTING feature set for reference; confirm before removing.
[
"PROPERTY_TYPE",
"BUILT_FORM",
"CONSTITUENCY",
"NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS",
"FIXED_LIGHTING_OUTLETS_COUNT",
"CONSTRUCTION_AGE_BAND",
"TRANSACTION_TYPE_STARTING",
"LIGHTING_DESCRIPTION_STARTING",
"MAINHEAT_DESCRIPTION_STARTING",
"HOTWATER_DESCRIPTION_STARTING",
"MAIN_FUEL_STARTING",
"MECHANICAL_VENTILATION_STARTING",
"SECONDHEAT_DESCRIPTION_STARTING",
"ENERGY_TARIFF_STARTING",
"SOLAR_WATER_HEATING_FLAG_STARTING",
"PHOTO_SUPPLY_STARTING",
"WINDOWS_DESCRIPTION_STARTING",
"GLAZED_TYPE_STARTING",
"MULTI_GLAZE_PROPORTION_STARTING",
"LOW_ENERGY_LIGHTING_STARTING",
"NUMBER_OPEN_FIREPLACES_STARTING",
"MAINHEATCONT_DESCRIPTION_STARTING",
"EXTENSION_COUNT_STARTING",
"TOTAL_FLOOR_AREA_STARTING",
"FLOOR_HEIGHT_STARTING",
"DAYS_TO_STARTING",
"WALLS_DESCRIPTION_STARTING",
"FLOOR_DESCRIPTION_STARTING",
]
# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
#
#

View file

@ -0,0 +1,150 @@
"""
Post Model generation step:
We want to look at feature analysis of the model
"""
import yaml
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
from alibi.explainers import PermutationImportance, plot_permutation_importance
import numpy as np
import pandas as pd
def _read_yaml(path):
    """
    Parse the YAML file at `path` and return the loaded object.
    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes.
    """
    with open(path) as config_file:
        return yaml.safe_load(config_file)


# Every config file is read from the `configs` directory next to this script.
_CONFIG_DIR = Path(__file__).parent / "configs"

client_path = _CONFIG_DIR / "client.yaml"
client_params = _read_yaml(client_path)

prepare_data_path = _CONFIG_DIR / "prepare_data.yaml"
prepare_data_params = _read_yaml(prepare_data_path)

feature_process_path = _CONFIG_DIR / "feature_processor.yaml"
feature_process_params = _read_yaml(feature_process_path)

build_model_path = _CONFIG_DIR / "build_model.yaml"
build_model_params = _read_yaml(build_model_path)

model_analysis_path = _CONFIG_DIR / "model_analysis.yaml"
model_analysis_params = _read_yaml(model_analysis_path)

generate_predictions_path = _CONFIG_DIR / "generate_predictions.yaml"
generate_predictions_params = _read_yaml(generate_predictions_path)

# Build the model type named in the build_model config and load it from the
# path that config points at.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

# The data client is configured from the client.yaml section that matches the
# client type chosen in model_analysis.yaml.
dataclient_type = model_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

# Settings for the analysis itself, all taken from model_analysis.yaml.
feature_importance_filepath = model_analysis_params["feature_importance_filepath"]
permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"]
loss_fns = model_analysis_params["loss_fns"]
feature_importance_column = model_analysis_params["feature_importance_column"]
n_repeats = model_analysis_params["n_repeats"]
figwidth = model_analysis_params["figwidth"]
figheight = model_analysis_params["figheight"]

# The target column and the test-set location come from the other stages'
# configs, so this script reads the same values they do.
target = feature_process_params["feature_processor_config"]["target"]
output_test_filepath = prepare_data_params["output_test_filepath"]
def model_analysis(
    model: MLModel,
    dataclient: DataClient,
    target: str,
    output_test_filepath: str,
    feature_importance_filepath: str,
    permutation_subsample_amount: int = 100,
    loss_fns: str = "mean_absolute_percentage_error",
    feature_importance_column: str = "importance",
    n_repeats: int = 5,
    figwidth: int = 7,
    figheight: int = 6,
):
    """
    Compute permutation feature importance for a model and save it as a table.

    Args:
        model: Model whose `predict` is called on the permuted feature frames.
        dataclient: Client used to load the test data and save the result.
        target: Name of the target column in the test data.
        output_test_filepath: Location the test data is loaded from.
        feature_importance_filepath: Location the importance table is saved to.
        permutation_subsample_amount: Upper bound on the number of test rows
            used; the first rows of the test data are taken.
        loss_fns: Loss function specification handed to PermutationImportance.
        feature_importance_column: Name of the importance column in the
            saved table.
        n_repeats: Passed to the explainer's `explain` call as `n_repeats`.
        figwidth: Figure width passed to the importance plot.
        figheight: Figure height passed to the importance plot.

    Raises:
        ValueError: If `target` is not a column of the loaded test data.
    """
    logger.info("------------------------------------")
    logger.info("--- Generate Feature Importance ---")
    logger.info("------------------------------------")

    test_df = dataclient.load_data(output_test_filepath)
    test_df = test_df.head(permutation_subsample_amount)

    feature_names = test_df.columns.to_list()
    feature_names.remove(target)
    # Remember each feature's dtype: to_numpy() collapses a mixed-type frame
    # into a single common dtype, so the per-column types are lost in the
    # array that the explainer permutes and hands back to predict_fn.
    feature_dtypes = test_df[feature_names].dtypes.to_dict()

    x = test_df[feature_names].to_numpy()
    y = test_df[target].to_numpy()

    def predict_fn(X: np.ndarray) -> np.ndarray:
        """Rebuild a typed, named DataFrame from the raw array and predict."""
        features = pd.DataFrame(X, columns=feature_names).astype(feature_dtypes)
        return model.predict(features)

    pfi = PermutationImportance(
        predictor=predict_fn,
        loss_fns=loss_fns,
        feature_names=feature_names,
        verbose=True,
    )

    # Report the number of rows actually used, which is smaller than the
    # requested amount when the test data has fewer rows.
    logger.info(
        f"Permutation feature importance - using {len(test_df)} samples and {n_repeats} shuffles per feature:"
    )
    exp = pfi.explain(x, y, n_repeats=n_repeats)

    # Only the first entry of the explanation's feature_importance list is
    # used - presumably one entry per loss function (TODO confirm).
    mean_value_feature_importance = [
        element["mean"] for element in exp.data["feature_importance"][0]
    ]

    feature_importance_df = pd.DataFrame(
        mean_value_feature_importance,
        index=exp.data["feature_names"],
        columns=[feature_importance_column],
    ).sort_values(feature_importance_column, ascending=False)

    # NOTE(review): the value returned by the plot call is discarded, so the
    # figure is neither shown nor written to disk from here.
    plot_permutation_importance(
        exp, fig_kw={"figwidth": figwidth, "figheight": figheight}
    )

    logger.info("--------------------------------------")
    logger.info("--- Save Feature Importance table ---")
    logger.info("--------------------------------------")
    dataclient.save_data(feature_importance_df, location=feature_importance_filepath)
if __name__ == "__main__":
    # Opening banner.
    for banner_line in (
        "----------------------------",
        f"--- {__file__} - Start! ---",
        "----------------------------",
    ):
        logger.info(banner_line)

    # Gather the module-level settings into one mapping and run the analysis.
    analysis_kwargs = {
        "model": model,
        "dataclient": dataclient,
        "target": target,
        "output_test_filepath": output_test_filepath,
        "feature_importance_filepath": feature_importance_filepath,
        "permutation_subsample_amount": permutation_subsample_amount,
        "loss_fns": loss_fns,
        "feature_importance_column": feature_importance_column,
        "n_repeats": n_repeats,
        "figwidth": figwidth,
        "figheight": figheight,
    }
    model_analysis(**analysis_kwargs)

    # Closing banner.
    for banner_line in (
        "-------------------------------",
        f"--- {__file__} - Complete! ---",
        "-------------------------------",
    ):
        logger.info(banner_line)

View file

@ -0,0 +1,111 @@
"""
Investigate why the model made a particular prediction.
This is a manual script, intended to be run by hand.
Workflow:
- Identify the prediction row(s) that you wish to look into
- e.g. one or more bad predictions
- Add these rows to the config
- Run the script
"""
import shap
shap.initjs()
import yaml
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import numpy as np
import pandas as pd
def _read_yaml(path):
    """
    Parse the YAML file at `path` and return the loaded object.
    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes.
    """
    with open(path) as config_file:
        return yaml.safe_load(config_file)


# Every config file is read from the `configs` directory next to this script.
_CONFIG_DIR = Path(__file__).parent / "configs"

client_path = _CONFIG_DIR / "client.yaml"
client_params = _read_yaml(client_path)

prepare_data_path = _CONFIG_DIR / "prepare_data.yaml"
prepare_data_params = _read_yaml(prepare_data_path)

feature_process_path = _CONFIG_DIR / "feature_processor.yaml"
feature_process_params = _read_yaml(feature_process_path)

build_model_path = _CONFIG_DIR / "build_model.yaml"
build_model_params = _read_yaml(build_model_path)

prediction_analysis_path = _CONFIG_DIR / "prediction_analysis.yaml"
prediction_analysis_params = _read_yaml(prediction_analysis_path)

# Build the model type named in the build_model config and load it from the
# path that config points at.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

# The data client is configured from the client.yaml section that matches the
# client type chosen in prediction_analysis.yaml.
dataclient_type = prediction_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

output_test_filepath = prepare_data_params["output_test_filepath"]
def prediction_analysis(
    model: MLModel,
    dataclient: DataClient,
    output_test_filepath: str,
    target: str = "SAP_ENDING",
    row_index: int = 0,
    nshap_samples: int = 100,
):
    """
    Explain a single test-set prediction with SHAP's KernelExplainer.

    Args:
        model: Loaded model; its `.model` attribute is the predictor whose
            `predict` is explained.
        dataclient: Client used to load the test data.
        output_test_filepath: Location the test data is loaded from.
        target: Name of the target column, dropped before explaining.
        row_index: Positional index of the test row to explain.
        nshap_samples: How many samples to use to approximate each Shapley
            value; larger values will be slower.
    """
    test_df = dataclient.load_data(output_test_filepath)
    test_df_without_target = test_df.drop(columns=[target])

    class AutogluonWrapper:
        """
        Adapter giving the explainer a `predict` that accepts a Series, a
        raw array or a DataFrame, and always calls the wrapped predictor
        with a DataFrame carrying the feature names.
        """

        def __init__(self, predictor, feature_names):
            self.ag_model = predictor
            self.feature_names = feature_names

        def predict(self, X):
            # A single row passed as a Series becomes a 1 x n_features array.
            if isinstance(X, pd.Series):
                X = X.values.reshape(1, -1)
            # Raw arrays get the feature names re-attached as columns.
            if not isinstance(X, pd.DataFrame):
                X = pd.DataFrame(X, columns=self.feature_names)
            return self.ag_model.predict(X)

    ag_wrapper = AutogluonWrapper(
        model.model, feature_names=test_df_without_target.columns
    )

    # NOTE(review): the whole test set is used as the explainer's background
    # data; a summarised background (e.g. shap.kmeans) was considered by the
    # original author and may be worth revisiting if this is slow.
    explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target)

    # Double brackets keep the selected row as a one-row DataFrame.
    single_datapoint = test_df_without_target.iloc[[row_index]]
    single_prediction = ag_wrapper.predict(single_datapoint)
    logger.info(
        f"Prediction for test row {row_index}: {np.ravel(single_prediction)[0]}"
    )

    shap_values_single = explainer.shap_values(
        single_datapoint, nsamples=nshap_samples
    )
    # NOTE(review): the value returned by force_plot is discarded here.
    shap.force_plot(
        explainer.expected_value,
        shap_values_single,
        test_df_without_target.iloc[row_index, :],
    )
if __name__ == "__main__":
    # Opening banner.
    for banner_line in (
        "----------------------------",
        f"--- {__file__} - Start! ---",
        "----------------------------",
    ):
        logger.info(banner_line)

    # Run the analysis with the objects prepared at module level.
    analysis_kwargs = {
        "model": model,
        "dataclient": dataclient,
        "output_test_filepath": output_test_filepath,
    }
    prediction_analysis(**analysis_kwargs)

    # Closing banner.
    for banner_line in (
        "-------------------------------",
        f"--- {__file__} - Complete! ---",
        "-------------------------------",
    ):
        logger.info(banner_line)

View file

@ -74,6 +74,9 @@ def prepare_data(
train, test = train_test_split(
data, train_size=train_proportion, test_size=(1 - train_proportion)
)
test = test.reset_index(drop=True)
train = train.reset_index(drop=True)
logger.info("-----------------------")
logger.info("--- Outputting data ---")

View file

@ -2,6 +2,7 @@ joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
alibi==0.9.4
pyarrow==13.0.0
pre-commit==3.3.3
sphinx==7.2.5