mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
commit
7729f96903
17 changed files with 556 additions and 50 deletions
|
|
@ -14,6 +14,6 @@ repos:
|
|||
hooks:
|
||||
- id: dvc-push-experiment
|
||||
name: DVC - Push to experiment to remote location (experiments)
|
||||
entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"'
|
||||
entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"'
|
||||
language: system
|
||||
verbose: true
|
||||
|
|
|
|||
1
modules/ml-pipeline/.gitignore
vendored
1
modules/ml-pipeline/.gitignore
vendored
|
|
@ -1,4 +1,5 @@
|
|||
.dev_env/
|
||||
.dev_env_pipeline/
|
||||
__pycache__/
|
||||
.DS_Store
|
||||
.vscode/
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -68,13 +68,13 @@ def build_model(
|
|||
data=train_data, target=target, model_hyperparameters=model_hyperparameters
|
||||
)
|
||||
|
||||
logger.info("------------------------------")
|
||||
logger.info("--- Generating predictions ---")
|
||||
logger.info("------------------------------")
|
||||
logger.info("----------------------------------")
|
||||
logger.info("--- Generating fit predictions ---")
|
||||
logger.info("----------------------------------")
|
||||
|
||||
prediction_data = train_data.drop(columns=target)
|
||||
|
||||
predictions = model.predict(data=prediction_data)
|
||||
fit_predictions = model.predict(data=prediction_data)
|
||||
|
||||
logger.info("------------------------------")
|
||||
logger.info("--- Generating fit metrics ---")
|
||||
|
|
@ -82,7 +82,7 @@ def build_model(
|
|||
|
||||
metrics_output = metrics.generate_metrics(
|
||||
target=train_data[target],
|
||||
predictions=pd.Series(predictions),
|
||||
predictions=pd.Series(fit_predictions),
|
||||
)
|
||||
|
||||
logger.info("--------------------")
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
model_type: SKLearnLinearRegression
|
||||
model_save_filepath: ./data/model/model.joblib
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
|
||||
SKLearnLinearRegression: null
|
||||
|
|
@ -12,5 +12,5 @@ AutogluonAutoML:
|
|||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 400
|
||||
presets: high_quality
|
||||
presets: good_quality
|
||||
excluded_model_types: ['KNN']
|
||||
|
|
|
|||
|
|
@ -2,7 +2,59 @@ feature_processor_type: dataframe
|
|||
feature_processor_config:
|
||||
subsample_amount: null
|
||||
subsample_seed: 0
|
||||
target: RDSAP_CHANGE
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"]
|
||||
retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
|
||||
target: SAP_ENDING
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"]
|
||||
# retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
|
||||
# retain_features: null
|
||||
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
|
||||
# 'NUMBER_HEATED_ROOMS',
|
||||
# 'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
# 'CONSTRUCTION_AGE_BAND',
|
||||
# 'TRANSACTION_TYPE_STARTING',
|
||||
# 'LIGHTING_DESCRIPTION_STARTING',
|
||||
# 'MAINHEAT_DESCRIPTION_STARTING',
|
||||
# 'HOTWATER_DESCRIPTION_STARTING',
|
||||
# 'MAIN_FUEL_STARTING',
|
||||
# 'MECHANICAL_VENTILATION_STARTING',
|
||||
# 'SECONDHEAT_DESCRIPTION_STARTING',
|
||||
# 'ENERGY_TARIFF_STARTING',
|
||||
# 'SOLAR_WATER_HEATING_FLAG_STARTING',
|
||||
# 'PHOTO_SUPPLY_STARTING',
|
||||
# 'WINDOWS_DESCRIPTION_STARTING',
|
||||
# 'GLAZED_TYPE_STARTING',
|
||||
# 'MULTI_GLAZE_PROPORTION_STARTING',
|
||||
# 'LOW_ENERGY_LIGHTING_STARTING',
|
||||
# 'NUMBER_OPEN_FIREPLACES_STARTING',
|
||||
# 'MAINHEATCONT_DESCRIPTION_STARTING',
|
||||
# 'EXTENSION_COUNT_STARTING',
|
||||
# 'TOTAL_FLOOR_AREA_STARTING',
|
||||
# 'FLOOR_HEIGHT_STARTING',
|
||||
# 'DAYS_TO_STARTING',
|
||||
# 'WALLS_DESCRIPTION_STARTING',
|
||||
# 'FLOOR_DESCRIPTION_STARTING']
|
||||
retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
|
||||
'NUMBER_HEATED_ROOMS',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'CONSTRUCTION_AGE_BAND',
|
||||
'TRANSACTION_TYPE_ENDING',
|
||||
'LIGHTING_DESCRIPTION_ENDING',
|
||||
'MAINHEAT_DESCRIPTION_ENDING',
|
||||
'HOTWATER_DESCRIPTION_ENDING',
|
||||
'MAIN_FUEL_ENDING',
|
||||
'MECHANICAL_VENTILATION_ENDING',
|
||||
'SECONDHEAT_DESCRIPTION_ENDING',
|
||||
'ENERGY_TARIFF_ENDING',
|
||||
'SOLAR_WATER_HEATING_FLAG_ENDING',
|
||||
'PHOTO_SUPPLY_ENDING',
|
||||
'WINDOWS_DESCRIPTION_ENDING',
|
||||
'GLAZED_TYPE_ENDING',
|
||||
'MULTI_GLAZE_PROPORTION_ENDING',
|
||||
'LOW_ENERGY_LIGHTING_ENDING',
|
||||
'NUMBER_OPEN_FIREPLACES_ENDING',
|
||||
'MAINHEATCONT_DESCRIPTION_ENDING',
|
||||
'EXTENSION_COUNT_ENDING',
|
||||
'TOTAL_FLOOR_AREA_ENDING',
|
||||
'FLOOR_HEIGHT_ENDING',
|
||||
'DAYS_TO_ENDING',
|
||||
'WALLS_DESCRIPTION_ENDING',
|
||||
'FLOOR_DESCRIPTION_ENDING']
|
||||
|
|
|
|||
|
|
@ -10,4 +10,10 @@ business_logic = {}
|
|||
"""
|
||||
New features dict + function
|
||||
"""
|
||||
new_feature_funcs = {}
|
||||
|
||||
|
||||
def SAP_ENDING(df):
    """
    Build the SAP_ENDING feature.

    Looks up the "SAP_STARTING" and "RDSAP_CHANGE" entries of ``df`` and
    returns their sum (``SAP_STARTING + RDSAP_CHANGE``).
    """
    starting_score = df["SAP_STARTING"]
    score_change = df["RDSAP_CHANGE"]
    return starting_score + score_change
|
||||
|
||||
|
||||
new_feature_funcs = {"SAP_ENDING": SAP_ENDING}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
dataclient_type: local
|
||||
input_datahandler_type: parquet
|
||||
output_datahandler_type: json
|
||||
metrics_type: Regression
|
||||
metrics_output_filepath: ./metrics/metrics.json
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
dataclient_type: local
|
||||
feature_importance_filepath: ./analysis/feature_importance.parquet
|
||||
permutation_subsample_amount: 1000
|
||||
loss_fns: "mean_absolute_percentage_error"
|
||||
feature_importance_column: importance
|
||||
n_repeats: 5
|
||||
figwidth: 7
|
||||
figheight: 6
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
input_dataclient_type: aws-s3
|
||||
output_dataclient_type: local
|
||||
datahandler_type: parquet
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
||||
train_proportion: 0.9
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
|
|
|
|||
|
|
@ -134,6 +134,8 @@ class DataFrameFeatureProcessor:
|
|||
subsample_amount=feature_processor_config["subsample_amount"],
|
||||
subsample_seed=feature_processor_config["subsample_seed"],
|
||||
)
|
||||
df = self.apply_business_logic(df, business_logic=business_logic)
|
||||
df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
|
||||
df = self.drop_unused_columns(
|
||||
df, drop_columns=feature_processor_config["drop_columns"]
|
||||
)
|
||||
|
|
@ -142,6 +144,4 @@ class DataFrameFeatureProcessor:
|
|||
retain_features=feature_processor_config["retain_features"],
|
||||
target=feature_processor_config["target"],
|
||||
)
|
||||
df = self.apply_business_logic(df, business_logic=business_logic)
|
||||
df = self.generate_new_features(df, new_feature_funcs=new_feature_funcs)
|
||||
return df
|
||||
|
|
|
|||
|
|
@ -5,8 +5,8 @@ stages:
|
|||
deps:
|
||||
- path: prepare_data.py
|
||||
hash: md5
|
||||
md5: 7531a931a405650dc4e8b5d8c1fd3c66
|
||||
size: 4959
|
||||
md5: 934d774e67f38e440b621ce71152f5f6
|
||||
size: 5031
|
||||
params:
|
||||
configs/prepare_data.yaml:
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
|
@ -15,20 +15,20 @@ stages:
|
|||
outs:
|
||||
- path: data/prepared_data/
|
||||
hash: md5
|
||||
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
|
||||
size: 13238511
|
||||
md5: 3767eec56906f5ac724a3f07433645ef.dir
|
||||
size: 13442342
|
||||
nfiles: 2
|
||||
build_model:
|
||||
cmd: python build_model.py
|
||||
deps:
|
||||
- path: build_model.py
|
||||
hash: md5
|
||||
md5: c07ce0b8fdaf337ddfb7115684932157
|
||||
size: 5048
|
||||
md5: f9fa2a66d908b42ae196ce6f0f782258
|
||||
size: 5134
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
|
||||
size: 13238511
|
||||
md5: 3767eec56906f5ac724a3f07433645ef.dir
|
||||
size: 13442342
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/build_model.yaml:
|
||||
|
|
@ -37,42 +37,42 @@ stages:
|
|||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 400
|
||||
presets: high_quality
|
||||
presets: good_quality
|
||||
excluded_model_types:
|
||||
- KNN
|
||||
SKLearnLinearRegression:
|
||||
SKLearnSVMRegression:
|
||||
kernel: linear
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
model_save_filepath: ./data/model/model.joblib
|
||||
model_type: SKLearnLinearRegression
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_type: AutogluonAutoML
|
||||
outs:
|
||||
- path: data/model/
|
||||
hash: md5
|
||||
md5: 2ace0835c28543512982b69d383b3c49.dir
|
||||
size: 1832
|
||||
nfiles: 1
|
||||
md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
|
||||
size: 118227750
|
||||
nfiles: 71
|
||||
- path: metrics/fit_metrics.json
|
||||
hash: md5
|
||||
md5: c8c5a40863e2ced7f5f5a844ba203d80
|
||||
size: 180
|
||||
md5: e1c9a16617804f48e8ffac7cec6575ca
|
||||
size: 185
|
||||
generate_predictions:
|
||||
cmd: python generate_predictions.py
|
||||
deps:
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: 2ace0835c28543512982b69d383b3c49.dir
|
||||
size: 1832
|
||||
nfiles: 1
|
||||
md5: 7b2f8334c81fb5ff23e42e77741b31d1.dir
|
||||
size: 118227750
|
||||
nfiles: 71
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
|
||||
size: 13238511
|
||||
md5: 3767eec56906f5ac724a3f07433645ef.dir
|
||||
size: 13442342
|
||||
nfiles: 2
|
||||
- path: generate_predictions.py
|
||||
hash: md5
|
||||
md5: ab603e9a526a73f2fe17603e6fe6c0a4
|
||||
size: 4261
|
||||
md5: a25c4611ff467cdc1c921918112a30fe
|
||||
size: 4311
|
||||
params:
|
||||
configs/generate_predictions.yaml:
|
||||
input_dataclient_type: local
|
||||
|
|
@ -83,26 +83,26 @@ stages:
|
|||
outs:
|
||||
- path: data/predictions/
|
||||
hash: md5
|
||||
md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
|
||||
size: 643838
|
||||
md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
|
||||
size: 536774
|
||||
nfiles: 1
|
||||
generate_metrics:
|
||||
cmd: python generate_metrics.py
|
||||
deps:
|
||||
- path: data/predictions
|
||||
hash: md5
|
||||
md5: e87d96ed77d01ab2f24aeab5aaafe344.dir
|
||||
size: 643838
|
||||
md5: fb7cf3f4a90598ec1e43a1b7a4af3bef.dir
|
||||
size: 536774
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: e36ed6e937196ab64dcfe9b5b97b6e9f.dir
|
||||
size: 13238511
|
||||
md5: 3767eec56906f5ac724a3f07433645ef.dir
|
||||
size: 13442342
|
||||
nfiles: 2
|
||||
- path: generate_metrics.py
|
||||
hash: md5
|
||||
md5: 78a9b9b25d0a7deaf44277f9afad5f98
|
||||
size: 4139
|
||||
md5: 8ce0b6b55e1688fca816985e0cf37f28
|
||||
size: 4220
|
||||
params:
|
||||
configs/generate_metrics.yaml:
|
||||
dataclient_type: local
|
||||
|
|
@ -113,7 +113,7 @@ stages:
|
|||
outs:
|
||||
- path: metrics/metrics.json
|
||||
hash: md5
|
||||
md5: f494881710a057f90f82c0bd3a40a41d
|
||||
md5: 852ef4cf2ca5e7f89d70420a9df7a596
|
||||
size: 183
|
||||
startup_cleanup:
|
||||
cmd: python startup_cleanup.py
|
||||
|
|
|
|||
177
modules/ml-pipeline/src/pipeline/eda.py
Normal file
177
modules/ml-pipeline/src/pipeline/eda.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
"""
|
||||
Doing some eda on dataset
|
||||
"""
|
||||
# Look at response variable
|
||||
|
||||
from matplotlib import pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
train_df = pd.read_parquet("./data/prepared_data/train.parquet")
target = "SAP_ENDING"

# Keep only the first 10,000 rows for the exploration below.
train_df = train_df.head(10000)

# train_df[target].plot(kind='hist')

# Plot the target variable, one bin per unit of the target.
# The values passed as ``bins`` are bin *edges* and range() excludes its stop,
# so the stop must be max + 2: the last edge is then max + 1 and rows equal to
# the maximum fall inside the final bin instead of outside every bin.
# NOTE(review): range() needs integer bounds, so this assumes an
# integer-valued target - confirm.
target_min = min(train_df[target])
target_max = max(train_df[target])
fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(train_df[target], bins=range(target_min, target_max + 2))

fig

# Find correlations between the numeric columns (the target included)
train_df.dtypes

# Correlation is only computed for numeric columns. pandas 1.5 deprecates
# dropping the other columns implicitly, so request it explicitly.
train_df_corr = train_df.corr(numeric_only=True)

train_df_corr.style.background_gradient(cmap="coolwarm")

train_df_corr["EXTENSION_COUNT_ENDING"]

# Check out some correlation plots between variables
# sap starting - negative correlation

train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")

# heat demand - light positive correlation
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
    x=target, y="HEAT_DEMAND_STARTING", style="o"
)
|
||||
|
||||
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
|
||||
|
||||
# Load the autogluon model and check feature importance
|
||||
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.DataClient import dataclient_factory
|
||||
from core.MLModels import model_factory
|
||||
from core.Logger import logger
|
||||
|
||||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

# The pipeline configs are read from the "configs" directory next to this
# script. Path.read_text() opens and closes each file itself, so no file
# handle is left open (the previous bare open() calls were never closed).
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(client_path.read_text())

prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(prepare_data_path.read_text())

build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(build_model_path.read_text())

generate_predictions_path = (
    Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(generate_predictions_path.read_text())

feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(feature_process_path.read_text())

# Build the model type named in build_model.yaml and load its saved state.
model = model_factory(build_model_params["model_type"])
model_filepath = build_model_params["model_save_filepath"]

model.load_model(model_filepath)

# NOTE(review): relies on the wrapped estimator (model.model) exposing a
# feature_importance() method - confirm this holds for every model type.
fi = model.model.feature_importance(train_df.reset_index(drop=True))
|
||||
|
||||
pred = pd.read_parquet("./data/predictions/predictions.parquet")
test_df = pd.read_parquet("./data/prepared_data/test.parquet")

# test_df = test_df.head(1000)

# Column assignment aligns on the index, so predictions and test rows are
# matched by index label.
test_df["predictions"] = pred["predictions"]

# Mean absolute error of the predictions within each property type.
test_df.groupby("PROPERTY_TYPE").apply(
    lambda x: (x.SAP_ENDING - x.predictions).abs().mean()
)

test_df.head()

# Take an explicit copy: adding a column to a boolean-mask selection of
# test_df otherwise triggers pandas' SettingWithCopyWarning.
flat_df = test_df[test_df["PROPERTY_TYPE"] == "Flat"].copy()

flat_df["residual"] = abs(flat_df["predictions"] - flat_df[target])

generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
# read_text() closes the file; the previous bare open() was never closed.
generate_metrics_params = yaml.safe_load(generate_metrics_path.read_text())
from core.MLMetrics import metrics_factory

metrics = metrics_factory(generate_metrics_params["metrics_type"])

# Metrics for the "Flat" rows only.
metrics_output = metrics.generate_metrics(
    target=flat_df[target],
    predictions=pd.Series(flat_df["predictions"]),
)
|
||||
|
||||
# Use alibi to run permutation importance
|
||||
|
||||
from alibi.explainers import PermutationImportance, plot_permutation_importance
|
||||
from sklearn.metrics import mean_absolute_percentage_error
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Reload the test split and keep its first 1,000 rows.
test_df = pd.read_parquet("./data/prepared_data/test.parquet").head(1000)

target = "SAP_ENDING"

# Every column except the target is treated as a feature.
feature_names = test_df.columns.to_list()
feature_names.remove(target)

x = test_df[feature_names].to_numpy()
y = test_df[target].to_numpy()


def predict_fn(X: np.ndarray) -> np.ndarray:
    """Rebuild a named DataFrame from the raw array and predict with ``model``."""
    feature_frame = pd.DataFrame(X, columns=feature_names)
    return model.predict(feature_frame)


pfi = PermutationImportance(
    predictor=predict_fn,
    loss_fns=mean_absolute_percentage_error,
    feature_names=feature_names,
    verbose=True,
)

exp = pfi.explain(x, y)

plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
|
||||
|
||||
# Scratch expression: this list literal is evaluated and discarded - it is not
# bound to any name and nothing below reads it.
[
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",
    "TRANSACTION_TYPE_STARTING",
    "LIGHTING_DESCRIPTION_STARTING",
    "MAINHEAT_DESCRIPTION_STARTING",
    "HOTWATER_DESCRIPTION_STARTING",
    "MAIN_FUEL_STARTING",
    "MECHANICAL_VENTILATION_STARTING",
    "SECONDHEAT_DESCRIPTION_STARTING",
    "ENERGY_TARIFF_STARTING",
    "SOLAR_WATER_HEATING_FLAG_STARTING",
    "PHOTO_SUPPLY_STARTING",
    "WINDOWS_DESCRIPTION_STARTING",
    "GLAZED_TYPE_STARTING",
    "MULTI_GLAZE_PROPORTION_STARTING",
    "LOW_ENERGY_LIGHTING_STARTING",
    "NUMBER_OPEN_FIREPLACES_STARTING",
    "MAINHEATCONT_DESCRIPTION_STARTING",
    "EXTENSION_COUNT_STARTING",
    "TOTAL_FLOOR_AREA_STARTING",
    "FLOOR_HEIGHT_STARTING",
    "DAYS_TO_STARTING",
    "WALLS_DESCRIPTION_STARTING",
    "FLOOR_DESCRIPTION_STARTING",
]
|
||||
|
||||
# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
|
||||
#
|
||||
#
|
||||
150
modules/ml-pipeline/src/pipeline/model_analysis.py
Normal file
150
modules/ml-pipeline/src/pipeline/model_analysis.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
"""
|
||||
Post Model generation step:
|
||||
We want to look at feature analysis of the model
|
||||
"""
|
||||
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.Logger import logger
|
||||
from core.MLModels import model_factory
|
||||
from core.DataClient import dataclient_factory
|
||||
from alibi.explainers import PermutationImportance, plot_permutation_importance
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# The pipeline configs are read from the "configs" directory next to this
# script. Path.read_text() opens and closes each file itself, so no file
# handle is left open (the previous bare open() calls were never closed).
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(client_path.read_text())

prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(prepare_data_path.read_text())

feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(feature_process_path.read_text())

build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(build_model_path.read_text())

model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
model_analysis_params = yaml.safe_load(model_analysis_path.read_text())

generate_predictions_path = (
    Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(generate_predictions_path.read_text())

# NOTE: the model is built and loaded at import time, not only when the
# script is run as __main__.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

# The data client type selects which section of client.yaml configures it.
dataclient_type = model_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

# Settings for the analysis itself, all taken from model_analysis.yaml.
feature_importance_filepath = model_analysis_params["feature_importance_filepath"]
permutation_subsample_amount = model_analysis_params["permutation_subsample_amount"]
loss_fns = model_analysis_params["loss_fns"]
feature_importance_column = model_analysis_params["feature_importance_column"]
n_repeats = model_analysis_params["n_repeats"]
figwidth = model_analysis_params["figwidth"]
figheight = model_analysis_params["figheight"]

# Target column and test-set location come from the upstream stage configs.
target = feature_process_params["feature_processor_config"]["target"]
output_test_filepath = prepare_data_params["output_test_filepath"]
|
||||
|
||||
|
||||
def model_analysis(
    model: MLModel,
    dataclient: DataClient,
    target: str,
    output_test_filepath: str,
    feature_importance_filepath: str,
    permutation_subsample_amount: int = 100,
    loss_fns: str = "mean_absolute_percentage_error",
    feature_importance_column: str = "importance",
    n_repeats: int = 5,
    figwidth: int = 7,
    figheight: int = 6,
):
    """
    Compute permutation feature importance for a model and save it as a table.

    The test set is loaded through ``dataclient``, cut to its first
    ``permutation_subsample_amount`` rows, and every column except ``target``
    is treated as a feature. The mean importance per feature is written to
    ``feature_importance_filepath`` through ``dataclient``, sorted from most to
    least important.

    Args:
        model: Model whose ``predict`` is called on a DataFrame of features.
        dataclient: Client used to load the test data and save the result.
        target: Name of the target column in the test data.
        output_test_filepath: Location of the test data to load.
        feature_importance_filepath: Location the importance table is saved to.
        permutation_subsample_amount: Number of leading test rows to use.
        loss_fns: Loss function passed to alibi's ``PermutationImportance``.
        feature_importance_column: Column name for the importance values.
        n_repeats: Passed to ``PermutationImportance.explain`` as ``n_repeats``.
        figwidth: Width passed to the importance plot.
        figheight: Height passed to the importance plot.

    Raises:
        ValueError: If the sampled test data is empty or has no ``target``
            column.

    Note:
        The importance plot is created but neither returned nor saved here.
    """

    logger.info("------------------------------------")
    logger.info("--- Generate Feature Importance ---")
    logger.info("------------------------------------")

    test_df = dataclient.load_data(output_test_filepath)
    test_df = test_df.head(permutation_subsample_amount)

    if test_df.empty:
        raise ValueError(
            f"No rows available for permutation importance from {output_test_filepath}"
        )

    feature_names = test_df.columns.to_list()
    try:
        feature_names.remove(target)
    except ValueError as err:
        # list.remove only reports "x not in list"; name the missing column.
        raise ValueError(
            f"Target column {target!r} not found in {output_test_filepath}"
        ) from err

    features_df = test_df[feature_names]
    # Remember the dtypes: alibi works on NumPy arrays, and a frame with mixed
    # column types becomes a single object array, losing per-column dtypes.
    feature_dtypes = features_df.dtypes.to_dict()

    x = features_df.to_numpy()
    y = test_df[target].to_numpy()

    def predict_fn(X: np.ndarray) -> np.ndarray:
        # Rebuild a named frame with the original dtypes so the model is given
        # the same column types as the data the array was taken from.
        frame = pd.DataFrame(X, columns=feature_names).astype(feature_dtypes)
        return model.predict(frame)

    pfi = PermutationImportance(
        predictor=predict_fn,
        loss_fns=loss_fns,
        feature_names=feature_names,
        verbose=True,
    )

    # Report the number of rows actually used; the test set may hold fewer
    # rows than permutation_subsample_amount.
    logger.info(
        f"Permutation feature importance - using {len(test_df)} samples and {n_repeats} shuffles per feature:"
    )

    exp = pfi.explain(x, y, n_repeats=n_repeats)

    # Index 0 selects the first loss function's results (the only one when a
    # single loss function is configured); each entry carries a "mean" value.
    mean_value_feature_importance = [
        element["mean"] for element in exp.data["feature_importance"][0]
    ]
    feature_importance_df = pd.DataFrame(
        mean_value_feature_importance,
        index=exp.data["feature_names"],
        columns=[feature_importance_column],
    ).sort_values(feature_importance_column, ascending=False)

    # NOTE(review): the plot is created but not saved or shown here - confirm
    # whether it should be persisted alongside the table.
    plot_permutation_importance(
        exp, fig_kw={"figwidth": figwidth, "figheight": figheight}
    )

    logger.info("--------------------------------------")
    logger.info("--- Save Feature Importance table ---")
    logger.info("--------------------------------------")

    dataclient.save_data(feature_importance_df, location=feature_importance_filepath)
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # Start banner.
    for banner_line in (
        "----------------------------",
        f"--- {__file__} - Start! ---",
        "----------------------------",
    ):
        logger.info(banner_line)

    # All arguments are the module-level values read from the YAML configs.
    analysis_kwargs = {
        "model": model,
        "dataclient": dataclient,
        "target": target,
        "output_test_filepath": output_test_filepath,
        "feature_importance_filepath": feature_importance_filepath,
        "permutation_subsample_amount": permutation_subsample_amount,
        "loss_fns": loss_fns,
        "feature_importance_column": feature_importance_column,
        "n_repeats": n_repeats,
        "figwidth": figwidth,
        "figheight": figheight,
    }
    model_analysis(**analysis_kwargs)

    # Completion banner.
    for banner_line in (
        "-------------------------------",
        f"--- {__file__} - Complete! ---",
        "-------------------------------",
    ):
        logger.info(banner_line)
|
||||
111
modules/ml-pipeline/src/pipeline/prediction_analysis.py
Normal file
111
modules/ml-pipeline/src/pipeline/prediction_analysis.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
"""
|
||||
Look at why the model made such a prediction
|
||||
Manual script to run
|
||||
Workflow:
|
||||
- Identify a prediction row/s that you wish to look into
|
||||
- i.e. a bad prediction/s
|
||||
- Add these rows to the config
|
||||
- Run script
|
||||
"""
|
||||
|
||||
import shap
|
||||
|
||||
shap.initjs()
|
||||
|
||||
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.Logger import logger
|
||||
from core.MLModels import model_factory
|
||||
from core.DataClient import dataclient_factory
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# The pipeline configs are read from the "configs" directory next to this
# script. Path.read_text() opens and closes each file itself, so no file
# handle is left open (the previous bare open() calls were never closed).
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(client_path.read_text())

prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(prepare_data_path.read_text())

feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(feature_process_path.read_text())

build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(build_model_path.read_text())

prediction_analysis_path = (
    Path(__file__).parent / "configs" / "prediction_analysis.yaml"
)
prediction_analysis_params = yaml.safe_load(prediction_analysis_path.read_text())

# NOTE: the model is built and loaded at import time, not only when the
# script is run as __main__.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

# The data client type selects which section of client.yaml configures it.
dataclient_type = prediction_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

output_test_filepath = prepare_data_params["output_test_filepath"]
|
||||
|
||||
|
||||
def prediction_analysis(
    model: MLModel,
    dataclient: DataClient,
    output_test_filepath: str,
    target: str = "SAP_ENDING",
    row_index: int = 0,
    nshap_samples: int = 100,
):
    """
    Explain a single test-set prediction with SHAP's KernelExplainer.

    The test set is loaded through ``dataclient``, the ``target`` column is
    dropped, and SHAP values are computed for the row at position
    ``row_index``. The prediction for that row is logged.

    Args:
        model: Loaded model; its wrapped estimator (``model.model``) is used
            for prediction.
        dataclient: Client used to load the test data.
        output_test_filepath: Location of the test data to load.
        target: Name of the target column to drop before explaining.
        row_index: Positional index of the test row to explain.
        nshap_samples: Passed to ``shap_values`` as ``nsamples``; larger values
            are slower.

    Returns:
        The object returned by ``shap.force_plot`` for the explained row.
    """

    test_df = dataclient.load_data(output_test_filepath)
    test_df_without_target = test_df.drop(columns=[target])

    # test_df_summary = shap.kmeans(test_df, 10)
    # print("Baseline feature-values: \n", test_df_summary)
    class AutogluonWrapper:
        """Adapter that lets the predictor accept Series, arrays or DataFrames."""

        def __init__(self, predictor, feature_names):
            self.ag_model = predictor
            self.feature_names = feature_names

        def predict(self, X):
            # A single row given as a Series becomes a 1 x n_features array.
            if isinstance(X, pd.Series):
                X = X.values.reshape(1, -1)
            # Anything that is not already a DataFrame gets the column names
            # back before it is handed to the predictor.
            if not isinstance(X, pd.DataFrame):
                X = pd.DataFrame(X, columns=self.feature_names)
            return self.ag_model.predict(X)

    ag_wrapper = AutogluonWrapper(
        model.model, feature_names=test_df_without_target.columns
    )
    # NOTE(review): the whole test set is used as the SHAP background data,
    # which may be slow for large test sets - consider summarising it (see the
    # commented-out shap.kmeans call above).
    explainer = shap.KernelExplainer(ag_wrapper.predict, test_df_without_target)

    single_datapoint = test_df_without_target.iloc[[row_index]]
    single_prediction = ag_wrapper.predict(single_datapoint)
    logger.info(f"Prediction for row {row_index}: {single_prediction}")

    shap_values_single = explainer.shap_values(single_datapoint, nsamples=nshap_samples)

    # Return the plot so the caller can display or save it.
    return shap.force_plot(
        explainer.expected_value,
        shap_values_single,
        test_df_without_target.iloc[row_index, :],
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # Start banner.
    for banner_line in (
        "----------------------------",
        f"--- {__file__} - Start! ---",
        "----------------------------",
    ):
        logger.info(banner_line)

    # Run the analysis with the module-level model, client and test path.
    prediction_analysis(
        model=model,
        dataclient=dataclient,
        output_test_filepath=output_test_filepath,
    )

    # Completion banner.
    for banner_line in (
        "-------------------------------",
        f"--- {__file__} - Complete! ---",
        "-------------------------------",
    ):
        logger.info(banner_line)
|
||||
|
|
@ -74,6 +74,9 @@ def prepare_data(
|
|||
train, test = train_test_split(
|
||||
data, train_size=train_proportion, test_size=(1 - train_proportion)
|
||||
)
|
||||
test = test.reset_index(drop=True)
|
||||
|
||||
train = train.reset_index(drop=True)
|
||||
|
||||
logger.info("-----------------------")
|
||||
logger.info("--- Outputting data ---")
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ joblib==1.3.2
|
|||
boto3==1.28.17
|
||||
pandas==1.5.3
|
||||
autogluon==0.8.2
|
||||
alibi==0.9.4
|
||||
pyarrow==13.0.0
|
||||
pre-commit==3.3.3
|
||||
sphinx==7.2.5
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue