use dynaconf to simplify configs

This commit is contained in:
Michael Duong 2023-09-29 11:37:36 +00:00
parent 44d0e145f6
commit ba592b36b7
25 changed files with 255 additions and 304 deletions

View file

@ -0,0 +1,3 @@
# Ignore dynaconf secret files
.secrets.*

View file

@ -6,9 +6,9 @@ import shutil
import yaml
from pathlib import Path
from core.Logger import logger
from config import settings
startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"
startup_cleanup_params = yaml.safe_load(open(startup_cleanup_path))
startup_cleanup_params = settings.startup_cleanup
def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:

View file

@ -15,6 +15,7 @@ from configs.feature_processor_logic import business_logic, new_feature_funcs
from core.Logger import logger
from core.DataClient import dataclient_factory
from core.FeatureProcessor import feature_processor_factory
from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---")
@ -22,14 +23,9 @@ logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(open(feature_process_path))
client_params = settings.client
prepare_data_params = settings.prepare_data
feature_process_params = settings.feature_processor
data_filepath = prepare_data_params["data_filepath"]
train_proportion = prepare_data_params["train_proportion"]

View file

@ -16,6 +16,7 @@ from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from configs.post_prediction_logic import post_prediction_logic
from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---")
@ -23,17 +24,10 @@ logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(open(feature_process_path))
generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
feature_process_params = settings.feature_processor
generate_metrics_params = settings.generate_metrics
model_type = build_model_params["model_type"]
target = feature_process_params["feature_processor_config"]["target"]
@ -149,8 +143,8 @@ if __name__ == "__main__":
metrics=metrics,
target=target,
model_save_location=model_save_location,
model_hyperparameters=build_model_params[model_type],
train_filepath=model_hyperparameters,
model_hyperparameters=model_hyperparameters,
train_filepath=train_filepath,
test_filepath=test_filepath,
fit_metrics_filepath=fit_metrics_filepath,
)

View file

@ -13,7 +13,7 @@ from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.Logger import logger
from configs.post_prediction_logic import post_prediction_logic
from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---")
@ -21,26 +21,20 @@ logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
client_params = settings.client
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
feature_process_params = settings.feature_processor
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
input_dataclient_type = generate_predictions_params["input_dataclient_type"]
output_dataclient_type = generate_predictions_params["output_dataclient_type"]
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(open(feature_process_path))
test_data_filepath = generate_predictions_params["test_data_filepath"]
test_data_filepath = os.environ.get("PREDICTION_FILE", test_data_filepath)
target = feature_process_params["feature_processor_config"]["target"]
model_filepath = build_model_params["model_save_filepath"]
test_data_filepath = generate_predictions_params["test_data_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
@ -57,13 +51,11 @@ logger.info("----------------------------")
# Data may be loaded from different locations, so we use the client type specified under `generate_predictions` in configs/settings.yaml
# I.e. for metric runs, this will be a local data client
# For predictions, we will want a cloud data client
input_dataclient_type = generate_predictions_params["input_dataclient_type"]
input_dataclient = dataclient_factory(
dataclient_type=input_dataclient_type,
dataclient_config=client_params[input_dataclient_type],
)
output_dataclient_type = generate_predictions_params["output_dataclient_type"]
output_dataclient = dataclient_factory(
dataclient_type=output_dataclient_type,
dataclient_config=client_params[output_dataclient_type],

View file

@ -14,7 +14,7 @@ from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from core.Logger import logger
from config import settings
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---")
@ -22,25 +22,12 @@ logger.info("----------------------------")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(open(feature_process_path))
client_params = settings.client
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
generate_metrics_params = settings.generate_metrics
feature_process_params = settings.feature_processor
target = feature_process_params["feature_processor_config"]["target"]
test_data_filepath = generate_predictions_params["test_data_filepath"]

View file

@ -0,0 +1,14 @@
from dynaconf import Dynaconf
# Shared settings object: the pipeline and analysis scripts import it and read
# their own section from it (e.g. `settings.prepare_data`, `settings.build_model`).
settings = Dynaconf(
# The YAML files below keep their values under top-level environment
# sections such as `default:` and `dev:`.
environments=True,
envvar_prefix="DYNACONF",
# NOTE(review): these paths are relative; the code this replaces built them
# from `Path(__file__).parent`. Confirm they still resolve when a script is
# launched from a different working directory.
settings_files=[
"./configs/settings.yaml",
"./configs/build_model.yaml",
"./configs/analysis.yaml",
],
)
# `envvar_prefix` = set values from the environment with `export DYNACONF_FOO=bar`.
# `settings_files` = load these files in the order listed.

View file

@ -0,0 +1,16 @@
default:
model_analysis:
dataclient_type: local
feature_importance_filepath: ./analysis/feature_importance.parquet
permutation_subsample_amount: 1000
loss_fns: "mean_absolute_percentage_error"
feature_importance_column: importance
n_repeats: 5
figwidth: 7
figheight: 6
prediction_analysis:
dataclient_type: local
nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
row_index: [0, 10, 20] # index of an example datapoint

View file

@ -1,16 +1,18 @@
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
fit_metrics_filepath: ./metrics/fit_metrics.json
default:
build_model:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression: null
SKLearnLinearRegression: null
SKLearnSVMRegression:
kernel: "linear"
SKLearnSVMRegression:
kernel: "linear"
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 800
presets: medium_quality
excluded_model_types: ['KNN']
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 75
presets: medium_quality
excluded_model_types: ['KNN']

View file

@ -1,10 +0,0 @@
aws-s3:
AWS_ACCESS_KEY_ID: null
AWS_SECRET_ACCESS_KEY: null
ENDPOINT_URL: null
aws-s3-mock:
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
ENDPOINT_URL: http://localhost:9000
local:
null

View file

@ -1,3 +0,0 @@
"""
Stitch all yaml configuration files together, override some settings (such as bucket location) based off environment variables
"""

View file

@ -1,61 +0,0 @@
feature_processor_type: dataframe
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
# retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
# retain_features: null
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
# 'NUMBER_HEATED_ROOMS',
# 'FIXED_LIGHTING_OUTLETS_COUNT',
# 'CONSTRUCTION_AGE_BAND',
# 'TRANSACTION_TYPE_STARTING',
# 'LIGHTING_DESCRIPTION_STARTING',
# 'MAINHEAT_DESCRIPTION_STARTING',
# 'HOTWATER_DESCRIPTION_STARTING',
# 'MAIN_FUEL_STARTING',
# 'MECHANICAL_VENTILATION_STARTING',
# 'SECONDHEAT_DESCRIPTION_STARTING',
# 'ENERGY_TARIFF_STARTING',
# 'SOLAR_WATER_HEATING_FLAG_STARTING',
# 'PHOTO_SUPPLY_STARTING',
# 'WINDOWS_DESCRIPTION_STARTING',
# 'GLAZED_TYPE_STARTING',
# 'MULTI_GLAZE_PROPORTION_STARTING',
# 'LOW_ENERGY_LIGHTING_STARTING',
# 'NUMBER_OPEN_FIREPLACES_STARTING',
# 'MAINHEATCONT_DESCRIPTION_STARTING',
# 'EXTENSION_COUNT_STARTING',
# 'TOTAL_FLOOR_AREA_STARTING',
# 'FLOOR_HEIGHT_STARTING',
# 'DAYS_TO_STARTING',
# 'WALLS_DESCRIPTION_STARTING',
# 'FLOOR_DESCRIPTION_STARTING']
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
# 'NUMBER_HEATED_ROOMS',
# 'FIXED_LIGHTING_OUTLETS_COUNT',
# 'CONSTRUCTION_AGE_BAND',
# 'TRANSACTION_TYPE_ENDING',
# 'LIGHTING_DESCRIPTION_ENDING',
# 'MAINHEAT_DESCRIPTION_ENDING',
# 'HOTWATER_DESCRIPTION_ENDING',
# 'MAIN_FUEL_ENDING',
# 'MECHANICAL_VENTILATION_ENDING',
# 'SECONDHEAT_DESCRIPTION_ENDING',
# 'ENERGY_TARIFF_ENDING',
# 'SOLAR_WATER_HEATING_FLAG_ENDING',
# 'PHOTO_SUPPLY_ENDING',
# 'WINDOWS_DESCRIPTION_ENDING',
# 'GLAZED_TYPE_ENDING',
# 'MULTI_GLAZE_PROPORTION_ENDING',
# 'LOW_ENERGY_LIGHTING_ENDING',
# 'NUMBER_OPEN_FIREPLACES_ENDING',
# 'MAINHEATCONT_DESCRIPTION_ENDING',
# 'EXTENSION_COUNT_ENDING',
# 'TOTAL_FLOOR_AREA_ENDING',
# 'FLOOR_HEIGHT_ENDING',
# 'DAYS_TO_ENDING',
# 'WALLS_DESCRIPTION_ENDING',
# 'FLOOR_DESCRIPTION_ENDING']
retain_features: null

View file

@ -1,3 +0,0 @@
dataclient_type: local
metrics_type: Regression
metrics_output_filepath: ./metrics/metrics.json

View file

@ -1,5 +0,0 @@
input_dataclient_type: local
output_dataclient_type: local
test_data_filepath: ./data/prepared_data/test.parquet
predictions_output_filepath: ./data/predictions/predictions.parquet
predictions_column_name: predictions

View file

@ -1,8 +0,0 @@
dataclient_type: local
feature_importance_filepath: ./analysis/feature_importance.parquet
permutation_subsample_amount: 1000
loss_fns: "mean_absolute_percentage_error"
feature_importance_column: importance
n_repeats: 5
figwidth: 7
figheight: 6

View file

@ -11,6 +11,7 @@ def clip_predictions_to_minimum_value(
series_name = predictions.name
predictions.name = "predictions"
predictions_df = pd.concat([data, predictions], axis=1)
# We expect every prediction to be at least a one-point improvement over SAP_STARTING
replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value

View file

@ -1,4 +0,0 @@
dataclient_type: local
nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
row_index: [0, 10, 20] # index of an example datapoint

View file

@ -1,9 +0,0 @@
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
# cache_o

View file

@ -0,0 +1,49 @@
default:
startup_cleanup:
artefacts: ./data
metrics: ./metrics
client:
aws-s3:
AWS_ACCESS_KEY_ID: null # Use local credentials
AWS_SECRET_ACCESS_KEY: null # Use local credentials
ENDPOINT_URL: null # Use local credentials
aws-s3-mock:
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
ENDPOINT_URL: http://localhost:9000
local:
null
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
feature_processor:
feature_processor_type: dataframe
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
retain_features: null
generate_predictions:
input_dataclient_type: local
output_dataclient_type: local
test_data_filepath: ./data/prepared_data/test.parquet
predictions_output_filepath: ./data/predictions/predictions.parquet
predictions_column_name: predictions
generate_metrics:
dataclient_type: local
metrics_type: Regression
metrics_output_filepath: ./metrics/metrics.json
dev:
generate_predictions:
input_dataclient_type: aws-s3

View file

@ -1,2 +0,0 @@
artefacts: ./data
metrics: ./metrics

View file

@ -5,122 +5,139 @@ stages:
deps:
- path: 1_prepare_data.py
hash: md5
md5: 2648d7d407dca857a1d20a11a88d3d98
size: 5116
md5: c9f030df733e318b80d1fa91b7732f79
size: 5132
params:
configs/prepare_data.yaml:
output_test_filepath: ./data/prepared_data/test.parquet
output_train_filepath: ./data/prepared_data/train.parquet
train_proportion: 0.9
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- UPRN
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
- HEAT_DEMAND_ENDING
- CARBON_ENDING
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 0.9
outs:
- path: data/prepared_data/
hash: md5
md5: 7bcbf81a82015276e25749d1bc249a57.dir
size: 21076961
md5: ed19a11a85d6a2006631173f51569d27.dir
size: 21131576
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
- path: 2_build_model.py
hash: md5
md5: 3eb1a5110df6e25a23d8e8a92bb27823
size: 5257
md5: 039578b629d7cd204016e92cd079ea90
size: 5181
- path: data/prepared_data
hash: md5
md5: 7bcbf81a82015276e25749d1bc249a57.dir
size: 21076961
md5: ed19a11a85d6a2006631173f51569d27.dir
size: 21131576
nfiles: 2
params:
configs/build_model.yaml:
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 800
presets: medium_quality
excluded_model_types:
- KNN
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
fit_metrics_filepath: ./metrics/fit_metrics.json
model_save_filepath: ./data/model/autogluonmodel/
model_type: AutogluonAutoML
default:
build_model:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 75
presets: medium_quality
excluded_model_types:
- KNN
outs:
- path: data/model/
hash: md5
md5: 397c46c062b51034b6f8f3f229345de3.dir
size: 334481421
nfiles: 18
md5: 60e253c42cc36934098c627ef3ef4cc1.dir
size: 185134993
nfiles: 14
- path: metrics/fit_metrics.json
hash: md5
md5: f6e7e21d4229d4a229ea0a11f3023637
size: 184
md5: a0c2a1c9e5da0b857d510fa1ba6282a8
size: 186
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
- path: data/model
hash: md5
md5: 397c46c062b51034b6f8f3f229345de3.dir
size: 334481421
nfiles: 18
- path: data/prepared_data
hash: md5
md5: 7bcbf81a82015276e25749d1bc249a57.dir
size: 21076961
nfiles: 2
- path: 3_generate_predictions.py
hash: md5
md5: 874da2443ef0d92731e4c127f3ce4acb
size: 4434
md5: 238b3fa9f3c6f3720e77c116857070ae
size: 4720
- path: data/model
hash: md5
md5: 60e253c42cc36934098c627ef3ef4cc1.dir
size: 185134993
nfiles: 14
- path: data/prepared_data
hash: md5
md5: ed19a11a85d6a2006631173f51569d27.dir
size: 21131576
nfiles: 2
params:
configs/generate_predictions.yaml:
input_dataclient_type: local
output_dataclient_type: local
predictions_column_name: predictions
predictions_output_filepath: ./data/predictions/predictions.parquet
test_data_filepath: ./data/prepared_data/test.parquet
configs/settings.yaml:
default.generate_predictions.input_dataclient_type: local
default.generate_predictions.output_dataclient_type: local
default.generate_predictions.predictions_column_name: predictions
default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet
default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet
outs:
- path: data/predictions/
hash: md5
md5: 9c18005e722f0e428f4b83c3f974f206.dir
size: 381870
md5: 700c8767de3a86c4c5339daf3cc17192.dir
size: 380962
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
deps:
- path: 4_generate_metrics.py
hash: md5
md5: 2c9fb78955a8c19cff0a098976f81d1b
size: 4487
- path: data/predictions
hash: md5
md5: 9c18005e722f0e428f4b83c3f974f206.dir
size: 381870
md5: 700c8767de3a86c4c5339daf3cc17192.dir
size: 380962
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 7bcbf81a82015276e25749d1bc249a57.dir
size: 21076961
md5: ed19a11a85d6a2006631173f51569d27.dir
size: 21131576
nfiles: 2
- path: 4_generate_metrics.py
hash: md5
md5: 8ce0b6b55e1688fca816985e0cf37f28
size: 4220
params:
configs/generate_metrics.yaml:
dataclient_type: local
metrics_output_filepath: ./metrics/metrics.json
metrics_type: Regression
configs/settings.yaml:
default.generate_metrics.dataclient_type: local
default.generate_metrics.metrics_output_filepath: ./metrics/metrics.json
default.generate_metrics.metrics_type: Regression
outs:
- path: metrics/metrics.json
hash: md5
md5: 93d9b69d6cd951ae2c14b29ba92a2a38
size: 186
md5: 45ffac8f6e7283df4b69af8a9abc45e1
size: 184
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: 2e51fbcac960d0f960bf32a8ec7486a0
size: 1748
md5: fbb7e3b1b98b517c870f3e1df3e7f695
size: 1676
params:
configs/startup_cleanup.yaml:
artefacts: ./data
metrics: ./metrics
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics

View file

@ -4,19 +4,28 @@ stages:
deps:
- 0_startup_cleanup.py
params:
- configs/startup_cleanup.yaml:
- artefacts
- metrics
- configs/settings.yaml:
- default.startup_cleanup.artefacts
- default.startup_cleanup.metrics
always_changed: true
prepare_data:
cmd: python 1_prepare_data.py
deps:
- 1_prepare_data.py
params:
- configs/prepare_data.yaml:
- output_test_filepath
- output_train_filepath
- train_proportion
- configs/settings.yaml:
- default.prepare_data.input_dataclient_type
- default.prepare_data.output_dataclient_type
- default.prepare_data.data_filepath
- default.prepare_data.train_proportion
- default.prepare_data.output_train_filepath
- default.prepare_data.output_test_filepath
- default.feature_processor.feature_processor_type
- default.feature_processor.feature_processor_config.subsample_amount
- default.feature_processor.feature_processor_config.subsample_seed
- default.feature_processor.feature_processor_config.target
- default.feature_processor.feature_processor_config.drop_columns
- default.feature_processor.feature_processor_config.retain_features
outs:
- data/prepared_data/
always_changed: true
@ -38,7 +47,12 @@ stages:
- data/prepared_data
- data/model
params:
- configs/generate_predictions.yaml:
- configs/settings.yaml:
- default.generate_predictions.input_dataclient_type
- default.generate_predictions.output_dataclient_type
- default.generate_predictions.test_data_filepath
- default.generate_predictions.predictions_output_filepath
- default.generate_predictions.predictions_column_name
outs:
- data/predictions/
always_changed: true
@ -49,7 +63,10 @@ stages:
- data/prepared_data
- data/predictions
params:
- configs/generate_metrics.yaml:
- configs/settings.yaml:
- default.generate_metrics.dataclient_type
- default.generate_metrics.metrics_type
- default.generate_metrics.metrics_output_filepath
outs:
- metrics/metrics.json
always_changed: true

View file

@ -3,8 +3,6 @@ Post Model generation step:
We want to look at feature analysis of the model
"""
import yaml
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
@ -13,27 +11,16 @@ from core.DataClient import dataclient_factory
from alibi.explainers import PermutationImportance, plot_permutation_importance
import numpy as np
import pandas as pd
from config import settings
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
client_params = settings.client
prepare_data_params = settings.prepare_data
feature_process_params = settings.feature_processor
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(open(feature_process_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
model_analysis_params = yaml.safe_load(open(model_analysis_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
model_analysis_params = settings.model_analysis
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

View file

@ -12,40 +12,21 @@ import shap
shap.initjs()
import yaml
from typing import List
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import numpy as np
import pandas as pd
from config import settings
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
feature_process_params = yaml.safe_load(open(feature_process_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
prediction_analysis_path = (
Path(__file__).parent / "configs" / "prediction_analysis.yaml"
)
prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path))
client_params = settings.client
prepare_data_params = settings.prepare_data
feature_process_params = settings.feature_processor
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
prediction_analysis_params = settings.prediction_analysis
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])