mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
use dynaconf to simplify configs
This commit is contained in:
parent
44d0e145f6
commit
ba592b36b7
25 changed files with 255 additions and 304 deletions
3
modules/ml-pipeline/src/pipeline/.gitignore
vendored
Normal file
3
modules/ml-pipeline/src/pipeline/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
|
||||
# Ignore dynaconf secret files
|
||||
.secrets.*
|
||||
|
|
@ -6,9 +6,9 @@ import shutil
|
|||
import yaml
|
||||
from pathlib import Path
|
||||
from core.Logger import logger
|
||||
from config import settings
|
||||
|
||||
startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml"
|
||||
startup_cleanup_params = yaml.safe_load(open(startup_cleanup_path))
|
||||
startup_cleanup_params = settings.startup_cleanup
|
||||
|
||||
|
||||
def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None:
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ from configs.feature_processor_logic import business_logic, new_feature_funcs
|
|||
from core.Logger import logger
|
||||
from core.DataClient import dataclient_factory
|
||||
from core.FeatureProcessor import feature_processor_factory
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
|
|
@ -22,14 +23,9 @@ logger.info("----------------------------")
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
|
||||
feature_process_params = yaml.safe_load(open(feature_process_path))
|
||||
client_params = settings.client
|
||||
prepare_data_params = settings.prepare_data
|
||||
feature_process_params = settings.feature_processor
|
||||
|
||||
data_filepath = prepare_data_params["data_filepath"]
|
||||
train_proportion = prepare_data_params["train_proportion"]
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ from core.DataClient import dataclient_factory
|
|||
from core.MLModels import model_factory
|
||||
from core.MLMetrics import metrics_factory
|
||||
from configs.post_prediction_logic import post_prediction_logic
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
|
|
@ -23,17 +24,10 @@ logger.info("----------------------------")
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
|
||||
build_model_params = yaml.safe_load(open(build_model_path))
|
||||
|
||||
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
|
||||
feature_process_params = yaml.safe_load(open(feature_process_path))
|
||||
|
||||
generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
|
||||
generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
|
||||
prepare_data_params = settings.prepare_data
|
||||
build_model_params = settings.build_model
|
||||
feature_process_params = settings.feature_processor
|
||||
generate_metrics_params = settings.generate_metrics
|
||||
|
||||
model_type = build_model_params["model_type"]
|
||||
target = feature_process_params["feature_processor_config"]["target"]
|
||||
|
|
@ -149,8 +143,8 @@ if __name__ == "__main__":
|
|||
metrics=metrics,
|
||||
target=target,
|
||||
model_save_location=model_save_location,
|
||||
model_hyperparameters=build_model_params[model_type],
|
||||
train_filepath=model_hyperparameters,
|
||||
model_hyperparameters=model_hyperparameters,
|
||||
train_filepath=train_filepath,
|
||||
test_filepath=test_filepath,
|
||||
fit_metrics_filepath=fit_metrics_filepath,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from core.DataClient import dataclient_factory
|
|||
from core.MLModels import model_factory
|
||||
from core.Logger import logger
|
||||
from configs.post_prediction_logic import post_prediction_logic
|
||||
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
|
|
@ -21,26 +21,20 @@ logger.info("----------------------------")
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
client_params = settings.client
|
||||
prepare_data_params = settings.prepare_data
|
||||
build_model_params = settings.build_model
|
||||
generate_predictions_params = settings.generate_predictions
|
||||
feature_process_params = settings.feature_processor
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
input_dataclient_type = generate_predictions_params["input_dataclient_type"]
|
||||
output_dataclient_type = generate_predictions_params["output_dataclient_type"]
|
||||
|
||||
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
|
||||
build_model_params = yaml.safe_load(open(build_model_path))
|
||||
|
||||
generate_predictions_path = (
|
||||
Path(__file__).parent / "configs" / "generate_predictions.yaml"
|
||||
)
|
||||
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
|
||||
|
||||
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
|
||||
feature_process_params = yaml.safe_load(open(feature_process_path))
|
||||
test_data_filepath = generate_predictions_params["test_data_filepath"]
|
||||
test_data_filepath = os.environ.get("PREDICTION_FILE", test_data_filepath)
|
||||
|
||||
target = feature_process_params["feature_processor_config"]["target"]
|
||||
model_filepath = build_model_params["model_save_filepath"]
|
||||
test_data_filepath = generate_predictions_params["test_data_filepath"]
|
||||
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
|
||||
predictions_column_name = generate_predictions_params["predictions_column_name"]
|
||||
|
||||
|
|
@ -57,13 +51,11 @@ logger.info("----------------------------")
|
|||
# Data may be loaded from different locations, so we use the data client type specified in the generate_predictions settings
|
||||
# I.e. for metric runs, this will be a local data client
|
||||
# For predictions, we will want a cloud data client
|
||||
input_dataclient_type = generate_predictions_params["input_dataclient_type"]
|
||||
input_dataclient = dataclient_factory(
|
||||
dataclient_type=input_dataclient_type,
|
||||
dataclient_config=client_params[input_dataclient_type],
|
||||
)
|
||||
|
||||
output_dataclient_type = generate_predictions_params["output_dataclient_type"]
|
||||
output_dataclient = dataclient_factory(
|
||||
dataclient_type=output_dataclient_type,
|
||||
dataclient_config=client_params[output_dataclient_type],
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ from core.DataClient import dataclient_factory
|
|||
from core.MLModels import model_factory
|
||||
from core.MLMetrics import metrics_factory
|
||||
from core.Logger import logger
|
||||
|
||||
from config import settings
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
|
|
@ -22,25 +22,12 @@ logger.info("----------------------------")
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
|
||||
build_model_params = yaml.safe_load(open(build_model_path))
|
||||
|
||||
generate_predictions_path = (
|
||||
Path(__file__).parent / "configs" / "generate_predictions.yaml"
|
||||
)
|
||||
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
|
||||
|
||||
generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml"
|
||||
generate_metrics_params = yaml.safe_load(open(generate_metrics_path))
|
||||
|
||||
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
|
||||
feature_process_params = yaml.safe_load(open(feature_process_path))
|
||||
client_params = settings.client
|
||||
prepare_data_params = settings.prepare_data
|
||||
build_model_params = settings.build_model
|
||||
generate_predictions_params = settings.generate_predictions
|
||||
generate_metrics_params = settings.generate_metrics
|
||||
feature_process_params = settings.feature_processor
|
||||
|
||||
target = feature_process_params["feature_processor_config"]["target"]
|
||||
test_data_filepath = generate_predictions_params["test_data_filepath"]
|
||||
|
|
|
|||
Binary file not shown.
14
modules/ml-pipeline/src/pipeline/config.py
Normal file
14
modules/ml-pipeline/src/pipeline/config.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
from dynaconf import Dynaconf
|
||||
|
||||
settings = Dynaconf(
|
||||
environments=True,
|
||||
envvar_prefix="DYNACONF",
|
||||
settings_files=[
|
||||
"./configs/settings.yaml",
|
||||
"./configs/build_model.yaml",
|
||||
"./configs/analysis.yaml",
|
||||
],
|
||||
)
|
||||
|
||||
# `envvar_prefix` = export envvars with `export DYNACONF_FOO=bar`.
|
||||
# `settings_files` = Load these files in the order listed.
|
||||
16
modules/ml-pipeline/src/pipeline/configs/analysis.yaml
Normal file
16
modules/ml-pipeline/src/pipeline/configs/analysis.yaml
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
default:
|
||||
model_analysis:
|
||||
dataclient_type: local
|
||||
feature_importance_filepath: ./analysis/feature_importance.parquet
|
||||
permutation_subsample_amount: 1000
|
||||
loss_fns: "mean_absolute_percentage_error"
|
||||
feature_importance_column: importance
|
||||
n_repeats: 5
|
||||
figwidth: 7
|
||||
figheight: 6
|
||||
|
||||
prediction_analysis:
|
||||
dataclient_type: local
|
||||
nshap_samples: 100 # how many samples to use to approximate each Shapley value; larger values will be slower
|
||||
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
|
||||
row_index: [0, 10, 20] # indices of example datapoints
|
||||
|
|
@ -1,16 +1,18 @@
|
|||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
default:
|
||||
build_model:
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
|
||||
SKLearnLinearRegression: null
|
||||
SKLearnLinearRegression: null
|
||||
|
||||
SKLearnSVMRegression:
|
||||
kernel: "linear"
|
||||
SKLearnSVMRegression:
|
||||
kernel: "linear"
|
||||
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 800
|
||||
presets: medium_quality
|
||||
excluded_model_types: ['KNN']
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 75
|
||||
presets: medium_quality
|
||||
excluded_model_types: ['KNN']
|
||||
|
|
|
|||
|
|
@ -1,10 +0,0 @@
|
|||
aws-s3:
|
||||
AWS_ACCESS_KEY_ID: null
|
||||
AWS_SECRET_ACCESS_KEY: null
|
||||
ENDPOINT_URL: null
|
||||
aws-s3-mock:
|
||||
AWS_ACCESS_KEY_ID: minio
|
||||
AWS_SECRET_ACCESS_KEY: minio123
|
||||
ENDPOINT_URL: http://localhost:9000
|
||||
local:
|
||||
null
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
"""
|
||||
Stitch all yaml configuration files together, override some settings (such as bucket location) based off environment variables
|
||||
"""
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
feature_processor_type: dataframe
|
||||
feature_processor_config:
|
||||
subsample_amount: null
|
||||
subsample_seed: 0
|
||||
target: SAP_ENDING
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
|
||||
# retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
|
||||
# retain_features: null
|
||||
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
|
||||
# 'NUMBER_HEATED_ROOMS',
|
||||
# 'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
# 'CONSTRUCTION_AGE_BAND',
|
||||
# 'TRANSACTION_TYPE_STARTING',
|
||||
# 'LIGHTING_DESCRIPTION_STARTING',
|
||||
# 'MAINHEAT_DESCRIPTION_STARTING',
|
||||
# 'HOTWATER_DESCRIPTION_STARTING',
|
||||
# 'MAIN_FUEL_STARTING',
|
||||
# 'MECHANICAL_VENTILATION_STARTING',
|
||||
# 'SECONDHEAT_DESCRIPTION_STARTING',
|
||||
# 'ENERGY_TARIFF_STARTING',
|
||||
# 'SOLAR_WATER_HEATING_FLAG_STARTING',
|
||||
# 'PHOTO_SUPPLY_STARTING',
|
||||
# 'WINDOWS_DESCRIPTION_STARTING',
|
||||
# 'GLAZED_TYPE_STARTING',
|
||||
# 'MULTI_GLAZE_PROPORTION_STARTING',
|
||||
# 'LOW_ENERGY_LIGHTING_STARTING',
|
||||
# 'NUMBER_OPEN_FIREPLACES_STARTING',
|
||||
# 'MAINHEATCONT_DESCRIPTION_STARTING',
|
||||
# 'EXTENSION_COUNT_STARTING',
|
||||
# 'TOTAL_FLOOR_AREA_STARTING',
|
||||
# 'FLOOR_HEIGHT_STARTING',
|
||||
# 'DAYS_TO_STARTING',
|
||||
# 'WALLS_DESCRIPTION_STARTING',
|
||||
# 'FLOOR_DESCRIPTION_STARTING']
|
||||
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
|
||||
# 'NUMBER_HEATED_ROOMS',
|
||||
# 'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
# 'CONSTRUCTION_AGE_BAND',
|
||||
# 'TRANSACTION_TYPE_ENDING',
|
||||
# 'LIGHTING_DESCRIPTION_ENDING',
|
||||
# 'MAINHEAT_DESCRIPTION_ENDING',
|
||||
# 'HOTWATER_DESCRIPTION_ENDING',
|
||||
# 'MAIN_FUEL_ENDING',
|
||||
# 'MECHANICAL_VENTILATION_ENDING',
|
||||
# 'SECONDHEAT_DESCRIPTION_ENDING',
|
||||
# 'ENERGY_TARIFF_ENDING',
|
||||
# 'SOLAR_WATER_HEATING_FLAG_ENDING',
|
||||
# 'PHOTO_SUPPLY_ENDING',
|
||||
# 'WINDOWS_DESCRIPTION_ENDING',
|
||||
# 'GLAZED_TYPE_ENDING',
|
||||
# 'MULTI_GLAZE_PROPORTION_ENDING',
|
||||
# 'LOW_ENERGY_LIGHTING_ENDING',
|
||||
# 'NUMBER_OPEN_FIREPLACES_ENDING',
|
||||
# 'MAINHEATCONT_DESCRIPTION_ENDING',
|
||||
# 'EXTENSION_COUNT_ENDING',
|
||||
# 'TOTAL_FLOOR_AREA_ENDING',
|
||||
# 'FLOOR_HEIGHT_ENDING',
|
||||
# 'DAYS_TO_ENDING',
|
||||
# 'WALLS_DESCRIPTION_ENDING',
|
||||
# 'FLOOR_DESCRIPTION_ENDING']
|
||||
retain_features: null
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
dataclient_type: local
|
||||
metrics_type: Regression
|
||||
metrics_output_filepath: ./metrics/metrics.json
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
input_dataclient_type: local
|
||||
output_dataclient_type: local
|
||||
test_data_filepath: ./data/prepared_data/test.parquet
|
||||
predictions_output_filepath: ./data/predictions/predictions.parquet
|
||||
predictions_column_name: predictions
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
dataclient_type: local
|
||||
feature_importance_filepath: ./analysis/feature_importance.parquet
|
||||
permutation_subsample_amount: 1000
|
||||
loss_fns: "mean_absolute_percentage_error"
|
||||
feature_importance_column: importance
|
||||
n_repeats: 5
|
||||
figwidth: 7
|
||||
figheight: 6
|
||||
|
|
@ -11,6 +11,7 @@ def clip_predictions_to_minimum_value(
|
|||
series_name = predictions.name
|
||||
predictions.name = "predictions"
|
||||
predictions_df = pd.concat([data, predictions], axis=1)
|
||||
# We expect every prediction to be an improvement of at least one point
|
||||
replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
|
||||
predictions_df.loc[replace_index, "predictions"] = (
|
||||
predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
dataclient_type: local
|
||||
nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower
|
||||
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
|
||||
row_index: [0, 10, 20] # index of an example datapoint
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
input_dataclient_type: aws-s3
|
||||
output_dataclient_type: local
|
||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
train_proportion: 0.9
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
||||
# cache_o
|
||||
49
modules/ml-pipeline/src/pipeline/configs/settings.yaml
Normal file
49
modules/ml-pipeline/src/pipeline/configs/settings.yaml
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
default:
|
||||
startup_cleanup:
|
||||
artefacts: ./data
|
||||
metrics: ./metrics
|
||||
|
||||
client:
|
||||
aws-s3:
|
||||
AWS_ACCESS_KEY_ID: null # Use local credentials
|
||||
AWS_SECRET_ACCESS_KEY: null # Use local credentials
|
||||
ENDPOINT_URL: null # Use local credentials
|
||||
aws-s3-mock:
|
||||
AWS_ACCESS_KEY_ID: minio
|
||||
AWS_SECRET_ACCESS_KEY: minio123
|
||||
ENDPOINT_URL: http://localhost:9000
|
||||
local:
|
||||
null
|
||||
|
||||
prepare_data:
|
||||
input_dataclient_type: aws-s3
|
||||
output_dataclient_type: local
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
train_proportion: 0.9
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
||||
feature_processor:
|
||||
feature_processor_type: dataframe
|
||||
feature_processor_config:
|
||||
subsample_amount: null
|
||||
subsample_seed: 0
|
||||
target: SAP_ENDING
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
|
||||
retain_features: null
|
||||
|
||||
generate_predictions:
|
||||
input_dataclient_type: local
|
||||
output_dataclient_type: local
|
||||
test_data_filepath: ./data/prepared_data/test.parquet
|
||||
predictions_output_filepath: ./data/predictions/predictions.parquet
|
||||
predictions_column_name: predictions
|
||||
|
||||
generate_metrics:
|
||||
dataclient_type: local
|
||||
metrics_type: Regression
|
||||
metrics_output_filepath: ./metrics/metrics.json
|
||||
|
||||
dev:
|
||||
generate_predictions:
|
||||
input_dataclient_type: aws-s3
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
artefacts: ./data
|
||||
metrics: ./metrics
|
||||
|
|
@ -5,122 +5,139 @@ stages:
|
|||
deps:
|
||||
- path: 1_prepare_data.py
|
||||
hash: md5
|
||||
md5: 2648d7d407dca857a1d20a11a88d3d98
|
||||
size: 5116
|
||||
md5: c9f030df733e318b80d1fa91b7732f79
|
||||
size: 5132
|
||||
params:
|
||||
configs/prepare_data.yaml:
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
train_proportion: 0.9
|
||||
configs/settings.yaml:
|
||||
default.feature_processor.feature_processor_config.drop_columns:
|
||||
- UPRN
|
||||
- HEAT_DEMAND_CHANGE
|
||||
- CARBON_CHANGE
|
||||
- RDSAP_CHANGE
|
||||
- HEAT_DEMAND_ENDING
|
||||
- CARBON_ENDING
|
||||
default.feature_processor.feature_processor_config.retain_features:
|
||||
default.feature_processor.feature_processor_config.subsample_amount:
|
||||
default.feature_processor.feature_processor_config.subsample_seed: 0
|
||||
default.feature_processor.feature_processor_config.target: SAP_ENDING
|
||||
default.feature_processor.feature_processor_type: dataframe
|
||||
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
default.prepare_data.input_dataclient_type: aws-s3
|
||||
default.prepare_data.output_dataclient_type: local
|
||||
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
|
||||
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
|
||||
default.prepare_data.train_proportion: 0.9
|
||||
outs:
|
||||
- path: data/prepared_data/
|
||||
hash: md5
|
||||
md5: 7bcbf81a82015276e25749d1bc249a57.dir
|
||||
size: 21076961
|
||||
md5: ed19a11a85d6a2006631173f51569d27.dir
|
||||
size: 21131576
|
||||
nfiles: 2
|
||||
build_model:
|
||||
cmd: python 2_build_model.py
|
||||
deps:
|
||||
- path: 2_build_model.py
|
||||
hash: md5
|
||||
md5: 3eb1a5110df6e25a23d8e8a92bb27823
|
||||
size: 5257
|
||||
md5: 039578b629d7cd204016e92cd079ea90
|
||||
size: 5181
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 7bcbf81a82015276e25749d1bc249a57.dir
|
||||
size: 21076961
|
||||
md5: ed19a11a85d6a2006631173f51569d27.dir
|
||||
size: 21131576
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/build_model.yaml:
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 800
|
||||
presets: medium_quality
|
||||
excluded_model_types:
|
||||
- KNN
|
||||
SKLearnLinearRegression:
|
||||
SKLearnSVMRegression:
|
||||
kernel: linear
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_type: AutogluonAutoML
|
||||
default:
|
||||
build_model:
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
SKLearnLinearRegression:
|
||||
SKLearnSVMRegression:
|
||||
kernel: linear
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 75
|
||||
presets: medium_quality
|
||||
excluded_model_types:
|
||||
- KNN
|
||||
outs:
|
||||
- path: data/model/
|
||||
hash: md5
|
||||
md5: 397c46c062b51034b6f8f3f229345de3.dir
|
||||
size: 334481421
|
||||
nfiles: 18
|
||||
md5: 60e253c42cc36934098c627ef3ef4cc1.dir
|
||||
size: 185134993
|
||||
nfiles: 14
|
||||
- path: metrics/fit_metrics.json
|
||||
hash: md5
|
||||
md5: f6e7e21d4229d4a229ea0a11f3023637
|
||||
size: 184
|
||||
md5: a0c2a1c9e5da0b857d510fa1ba6282a8
|
||||
size: 186
|
||||
generate_predictions:
|
||||
cmd: python 3_generate_predictions.py
|
||||
deps:
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: 397c46c062b51034b6f8f3f229345de3.dir
|
||||
size: 334481421
|
||||
nfiles: 18
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 7bcbf81a82015276e25749d1bc249a57.dir
|
||||
size: 21076961
|
||||
nfiles: 2
|
||||
- path: 3_generate_predictions.py
|
||||
hash: md5
|
||||
md5: 874da2443ef0d92731e4c127f3ce4acb
|
||||
size: 4434
|
||||
md5: 238b3fa9f3c6f3720e77c116857070ae
|
||||
size: 4720
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: 60e253c42cc36934098c627ef3ef4cc1.dir
|
||||
size: 185134993
|
||||
nfiles: 14
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: ed19a11a85d6a2006631173f51569d27.dir
|
||||
size: 21131576
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/generate_predictions.yaml:
|
||||
input_dataclient_type: local
|
||||
output_dataclient_type: local
|
||||
predictions_column_name: predictions
|
||||
predictions_output_filepath: ./data/predictions/predictions.parquet
|
||||
test_data_filepath: ./data/prepared_data/test.parquet
|
||||
configs/settings.yaml:
|
||||
default.generate_predictions.input_dataclient_type: local
|
||||
default.generate_predictions.output_dataclient_type: local
|
||||
default.generate_predictions.predictions_column_name: predictions
|
||||
default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet
|
||||
default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet
|
||||
outs:
|
||||
- path: data/predictions/
|
||||
hash: md5
|
||||
md5: 9c18005e722f0e428f4b83c3f974f206.dir
|
||||
size: 381870
|
||||
md5: 700c8767de3a86c4c5339daf3cc17192.dir
|
||||
size: 380962
|
||||
nfiles: 1
|
||||
generate_metrics:
|
||||
cmd: python 4_generate_metrics.py
|
||||
deps:
|
||||
- path: 4_generate_metrics.py
|
||||
hash: md5
|
||||
md5: 2c9fb78955a8c19cff0a098976f81d1b
|
||||
size: 4487
|
||||
- path: data/predictions
|
||||
hash: md5
|
||||
md5: 9c18005e722f0e428f4b83c3f974f206.dir
|
||||
size: 381870
|
||||
md5: 700c8767de3a86c4c5339daf3cc17192.dir
|
||||
size: 380962
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 7bcbf81a82015276e25749d1bc249a57.dir
|
||||
size: 21076961
|
||||
md5: ed19a11a85d6a2006631173f51569d27.dir
|
||||
size: 21131576
|
||||
nfiles: 2
|
||||
- path: 4_generate_metrics.py
|
||||
hash: md5
|
||||
md5: 8ce0b6b55e1688fca816985e0cf37f28
|
||||
size: 4220
|
||||
params:
|
||||
configs/generate_metrics.yaml:
|
||||
dataclient_type: local
|
||||
metrics_output_filepath: ./metrics/metrics.json
|
||||
metrics_type: Regression
|
||||
configs/settings.yaml:
|
||||
default.generate_metrics.dataclient_type: local
|
||||
default.generate_metrics.metrics_output_filepath: ./metrics/metrics.json
|
||||
default.generate_metrics.metrics_type: Regression
|
||||
outs:
|
||||
- path: metrics/metrics.json
|
||||
hash: md5
|
||||
md5: 93d9b69d6cd951ae2c14b29ba92a2a38
|
||||
size: 186
|
||||
md5: 45ffac8f6e7283df4b69af8a9abc45e1
|
||||
size: 184
|
||||
startup_cleanup:
|
||||
cmd: python 0_startup_cleanup.py
|
||||
deps:
|
||||
- path: 0_startup_cleanup.py
|
||||
hash: md5
|
||||
md5: 2e51fbcac960d0f960bf32a8ec7486a0
|
||||
size: 1748
|
||||
md5: fbb7e3b1b98b517c870f3e1df3e7f695
|
||||
size: 1676
|
||||
params:
|
||||
configs/startup_cleanup.yaml:
|
||||
artefacts: ./data
|
||||
metrics: ./metrics
|
||||
configs/settings.yaml:
|
||||
default.startup_cleanup.artefacts: ./data
|
||||
default.startup_cleanup.metrics: ./metrics
|
||||
|
|
|
|||
|
|
@ -4,19 +4,28 @@ stages:
|
|||
deps:
|
||||
- 0_startup_cleanup.py
|
||||
params:
|
||||
- configs/startup_cleanup.yaml:
|
||||
- artefacts
|
||||
- metrics
|
||||
- configs/settings.yaml:
|
||||
- default.startup_cleanup.artefacts
|
||||
- default.startup_cleanup.metrics
|
||||
always_changed: true
|
||||
prepare_data:
|
||||
cmd: python 1_prepare_data.py
|
||||
deps:
|
||||
- 1_prepare_data.py
|
||||
params:
|
||||
- configs/prepare_data.yaml:
|
||||
- output_test_filepath
|
||||
- output_train_filepath
|
||||
- train_proportion
|
||||
- configs/settings.yaml:
|
||||
- default.prepare_data.input_dataclient_type
|
||||
- default.prepare_data.output_dataclient_type
|
||||
- default.prepare_data.data_filepath
|
||||
- default.prepare_data.train_proportion
|
||||
- default.prepare_data.output_train_filepath
|
||||
- default.prepare_data.output_test_filepath
|
||||
- default.feature_processor.feature_processor_type
|
||||
- default.feature_processor.feature_processor_config.subsample_amount
|
||||
- default.feature_processor.feature_processor_config.subsample_seed
|
||||
- default.feature_processor.feature_processor_config.target
|
||||
- default.feature_processor.feature_processor_config.drop_columns
|
||||
- default.feature_processor.feature_processor_config.retain_features
|
||||
outs:
|
||||
- data/prepared_data/
|
||||
always_changed: true
|
||||
|
|
@ -38,7 +47,12 @@ stages:
|
|||
- data/prepared_data
|
||||
- data/model
|
||||
params:
|
||||
- configs/generate_predictions.yaml:
|
||||
- configs/settings.yaml:
|
||||
- default.generate_predictions.input_dataclient_type
|
||||
- default.generate_predictions.output_dataclient_type
|
||||
- default.generate_predictions.test_data_filepath
|
||||
- default.generate_predictions.predictions_output_filepath
|
||||
- default.generate_predictions.predictions_column_name
|
||||
outs:
|
||||
- data/predictions/
|
||||
always_changed: true
|
||||
|
|
@ -49,7 +63,10 @@ stages:
|
|||
- data/prepared_data
|
||||
- data/predictions
|
||||
params:
|
||||
- configs/generate_metrics.yaml:
|
||||
- configs/settings.yaml:
|
||||
- default.generate_metrics.dataclient_type
|
||||
- default.generate_metrics.metrics_type
|
||||
- default.generate_metrics.metrics_output_filepath
|
||||
outs:
|
||||
- metrics/metrics.json
|
||||
always_changed: true
|
||||
|
|
|
|||
|
|
@ -3,8 +3,6 @@ Post Model generation step:
|
|||
We want to look at feature analysis of the model
|
||||
"""
|
||||
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.Logger import logger
|
||||
|
|
@ -13,27 +11,16 @@ from core.DataClient import dataclient_factory
|
|||
from alibi.explainers import PermutationImportance, plot_permutation_importance
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from config import settings
|
||||
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
client_params = settings.client
|
||||
prepare_data_params = settings.prepare_data
|
||||
feature_process_params = settings.feature_processor
|
||||
build_model_params = settings.build_model
|
||||
generate_predictions_params = settings.generate_predictions
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
|
||||
feature_process_params = yaml.safe_load(open(feature_process_path))
|
||||
|
||||
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
|
||||
build_model_params = yaml.safe_load(open(build_model_path))
|
||||
|
||||
model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml"
|
||||
model_analysis_params = yaml.safe_load(open(model_analysis_path))
|
||||
|
||||
generate_predictions_path = (
|
||||
Path(__file__).parent / "configs" / "generate_predictions.yaml"
|
||||
)
|
||||
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
|
||||
model_analysis_params = settings.model_analysis
|
||||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
model.load_model(build_model_params["model_save_filepath"])
|
||||
|
|
|
|||
|
|
@ -12,40 +12,21 @@ import shap
|
|||
|
||||
shap.initjs()
|
||||
|
||||
|
||||
import yaml
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.Logger import logger
|
||||
from core.MLModels import model_factory
|
||||
from core.DataClient import dataclient_factory
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from config import settings
|
||||
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml"
|
||||
feature_process_params = yaml.safe_load(open(feature_process_path))
|
||||
|
||||
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
|
||||
build_model_params = yaml.safe_load(open(build_model_path))
|
||||
|
||||
generate_predictions_path = (
|
||||
Path(__file__).parent / "configs" / "generate_predictions.yaml"
|
||||
)
|
||||
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
|
||||
|
||||
prediction_analysis_path = (
|
||||
Path(__file__).parent / "configs" / "prediction_analysis.yaml"
|
||||
)
|
||||
prediction_analysis_params = yaml.safe_load(open(prediction_analysis_path))
|
||||
client_params = settings.client
|
||||
prepare_data_params = settings.prepare_data
|
||||
feature_process_params = settings.feature_processor
|
||||
build_model_params = settings.build_model
|
||||
generate_predictions_params = settings.generate_predictions
|
||||
prediction_analysis_params = settings.prediction_analysis
|
||||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
model.load_model(build_model_params["model_save_filepath"])
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue