diff --git a/modules/ml-pipeline/src/pipeline/.gitignore b/modules/ml-pipeline/src/pipeline/.gitignore new file mode 100644 index 0000000..bf035d2 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/.gitignore @@ -0,0 +1,3 @@ + +# Ignore dynaconf secret files +.secrets.* diff --git a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py index af63291..0bfa37f 100644 --- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py +++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py @@ -6,9 +6,9 @@ import shutil import yaml from pathlib import Path from core.Logger import logger +from config import settings -startup_cleanup_path = Path(__file__).parent / "configs" / "startup_cleanup.yaml" -startup_cleanup_params = yaml.safe_load(open(startup_cleanup_path)) +startup_cleanup_params = settings.startup_cleanup def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index dac3aeb..32daa19 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -15,6 +15,7 @@ from configs.feature_processor_logic import business_logic, new_feature_funcs from core.Logger import logger from core.DataClient import dataclient_factory from core.FeatureProcessor import feature_processor_factory +from config import settings logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") @@ -22,14 +23,9 @@ logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") -client_path = Path(__file__).parent / "configs" / "client.yaml" -client_params = yaml.safe_load(open(client_path)) - -prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" -prepare_data_params = yaml.safe_load(open(prepare_data_path)) - -feature_process_path = 
Path(__file__).parent / "configs" / "feature_processor.yaml" -feature_process_params = yaml.safe_load(open(feature_process_path)) +client_params = settings.client +prepare_data_params = settings.prepare_data +feature_process_params = settings.feature_processor data_filepath = prepare_data_params["data_filepath"] train_proportion = prepare_data_params["train_proportion"] diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index d01dc2b..f7746f9 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -16,6 +16,7 @@ from core.DataClient import dataclient_factory from core.MLModels import model_factory from core.MLMetrics import metrics_factory from configs.post_prediction_logic import post_prediction_logic +from config import settings logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") @@ -23,17 +24,10 @@ logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") -prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" -prepare_data_params = yaml.safe_load(open(prepare_data_path)) - -build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" -build_model_params = yaml.safe_load(open(build_model_path)) - -feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" -feature_process_params = yaml.safe_load(open(feature_process_path)) - -generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml" -generate_metrics_params = yaml.safe_load(open(generate_metrics_path)) +prepare_data_params = settings.prepare_data +build_model_params = settings.build_model +feature_process_params = settings.feature_processor +generate_metrics_params = settings.generate_metrics model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] @@ 
-149,8 +143,8 @@ if __name__ == "__main__": metrics=metrics, target=target, model_save_location=model_save_location, - model_hyperparameters=build_model_params[model_type], - train_filepath=model_hyperparameters, + model_hyperparameters=model_hyperparameters, + train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, ) diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index d9899f1..f977d9a 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -13,7 +13,7 @@ from core.DataClient import dataclient_factory from core.MLModels import model_factory from core.Logger import logger from configs.post_prediction_logic import post_prediction_logic - +from config import settings logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") @@ -21,26 +21,20 @@ logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") -client_path = Path(__file__).parent / "configs" / "client.yaml" -client_params = yaml.safe_load(open(client_path)) +client_params = settings.client +prepare_data_params = settings.prepare_data +build_model_params = settings.build_model +generate_predictions_params = settings.generate_predictions +feature_process_params = settings.feature_processor -prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" -prepare_data_params = yaml.safe_load(open(prepare_data_path)) +input_dataclient_type = generate_predictions_params["input_dataclient_type"] +output_dataclient_type = generate_predictions_params["output_dataclient_type"] -build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" -build_model_params = yaml.safe_load(open(build_model_path)) - -generate_predictions_path = ( - Path(__file__).parent / "configs" / "generate_predictions.yaml" -) 
-generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) - -feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" -feature_process_params = yaml.safe_load(open(feature_process_path)) +test_data_filepath = generate_predictions_params["test_data_filepath"] +test_data_filepath = os.environ.get("PREDICTION_FILE", test_data_filepath) target = feature_process_params["feature_processor_config"]["target"] model_filepath = build_model_params["model_save_filepath"] -test_data_filepath = generate_predictions_params["test_data_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_column_name = generate_predictions_params["predictions_column_name"] @@ -57,13 +51,11 @@ logger.info("----------------------------") # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. for metric runs, this will be a local data client # For predictions, we will want a cloud data client -input_dataclient_type = generate_predictions_params["input_dataclient_type"] input_dataclient = dataclient_factory( dataclient_type=input_dataclient_type, dataclient_config=client_params[input_dataclient_type], ) -output_dataclient_type = generate_predictions_params["output_dataclient_type"] output_dataclient = dataclient_factory( dataclient_type=output_dataclient_type, dataclient_config=client_params[output_dataclient_type], diff --git a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py index 709ce53..7b115a2 100644 --- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py @@ -14,7 +14,7 @@ from core.DataClient import dataclient_factory from core.MLModels import model_factory from core.MLMetrics import metrics_factory from core.Logger import logger - +from config import settings logger.info("----------------------------") 
logger.info(f"--- Initiate Parameters ---") @@ -22,25 +22,12 @@ logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") -client_path = Path(__file__).parent / "configs" / "client.yaml" -client_params = yaml.safe_load(open(client_path)) - -prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" -prepare_data_params = yaml.safe_load(open(prepare_data_path)) - -build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" -build_model_params = yaml.safe_load(open(build_model_path)) - -generate_predictions_path = ( - Path(__file__).parent / "configs" / "generate_predictions.yaml" -) -generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) - -generate_metrics_path = Path(__file__).parent / "configs" / "generate_metrics.yaml" -generate_metrics_params = yaml.safe_load(open(generate_metrics_path)) - -feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" -feature_process_params = yaml.safe_load(open(feature_process_path)) +client_params = settings.client +prepare_data_params = settings.prepare_data +build_model_params = settings.build_model +generate_predictions_params = settings.generate_predictions +generate_metrics_params = settings.generate_metrics +feature_process_params = settings.feature_processor target = feature_process_params["feature_processor_config"]["target"] test_data_filepath = generate_predictions_params["test_data_filepath"] diff --git a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet index b0c328f..6960946 100644 Binary files a/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet and b/modules/ml-pipeline/src/pipeline/analysis/feature_importance.parquet differ diff --git a/modules/ml-pipeline/src/pipeline/config.py b/modules/ml-pipeline/src/pipeline/config.py new file mode 100644 index 0000000..7a7366b --- 
/dev/null +++ b/modules/ml-pipeline/src/pipeline/config.py @@ -0,0 +1,14 @@ +from dynaconf import Dynaconf + +settings = Dynaconf( + environments=True, + envvar_prefix="DYNACONF", + settings_files=[ + "./configs/settings.yaml", + "./configs/build_model.yaml", + "./configs/analysis.yaml", + ], +) + +# `envvar_prefix` = export envvars with `export DYNACONF_FOO=bar`. +# `settings_files` = Load these files in the order listed. diff --git a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml new file mode 100644 index 0000000..5c6e749 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml @@ -0,0 +1,16 @@ +default: + model_analysis: + dataclient_type: local + feature_importance_filepath: ./analysis/feature_importance.parquet + permutation_subsample_amount: 1000 + loss_fns: "mean_absolute_percentage_error" + feature_importance_column: importance + n_repeats: 5 + figwidth: 7 + figheight: 6 + + prediction_analysis: + dataclient_type: local + nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower + n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower + row_index: [0, 10, 20] # index of an example datapoint diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 08108fb..c8c022d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -1,16 +1,18 @@ -model_type: AutogluonAutoML -model_save_filepath: ./data/model/autogluonmodel/ -fit_metrics_filepath: ./metrics/fit_metrics.json +default: + build_model: + model_type: AutogluonAutoML + model_save_filepath: ./data/model/autogluonmodel/ + fit_metrics_filepath: ./metrics/fit_metrics.json -SKLearnLinearRegression: null + SKLearnLinearRegression: null -SKLearnSVMRegression: - kernel:
"linear" + SKLearnSVMRegression: + kernel: "linear" -AutogluonAutoML: - output_filepath: ./data/model/autogluonmodel/ - problem_type: regression - eval_metric: mean_absolute_error - time_limit: 800 - presets: medium_quality - excluded_model_types: ['KNN'] + AutogluonAutoML: + output_filepath: ./data/model/autogluonmodel/ + problem_type: regression + eval_metric: mean_absolute_error + time_limit: 75 + presets: medium_quality + excluded_model_types: ['KNN'] diff --git a/modules/ml-pipeline/src/pipeline/configs/client.yaml b/modules/ml-pipeline/src/pipeline/configs/client.yaml deleted file mode 100644 index 65dc7a2..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/client.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aws-s3: - AWS_ACCESS_KEY_ID: null - AWS_SECRET_ACCESS_KEY: null - ENDPOINT_URL: null -aws-s3-mock: - AWS_ACCESS_KEY_ID: minio - AWS_SECRET_ACCESS_KEY: minio123 - ENDPOINT_URL: http://localhost:9000 -local: - null diff --git a/modules/ml-pipeline/src/pipeline/configs/configs.py b/modules/ml-pipeline/src/pipeline/configs/configs.py deleted file mode 100644 index d657121..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/configs.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Stitch all yaml configuration files together, override some settings (such as bucket location) based off environment variables -""" diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml deleted file mode 100644 index 74d1823..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml +++ /dev/null @@ -1,61 +0,0 @@ -feature_processor_type: dataframe -feature_processor_config: - subsample_amount: null - subsample_seed: 0 - target: SAP_ENDING - drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] - # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", 
"NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"] - # retain_features: null -# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS', -# 'NUMBER_HEATED_ROOMS', -# 'FIXED_LIGHTING_OUTLETS_COUNT', -# 'CONSTRUCTION_AGE_BAND', -# 'TRANSACTION_TYPE_STARTING', -# 'LIGHTING_DESCRIPTION_STARTING', -# 'MAINHEAT_DESCRIPTION_STARTING', -# 'HOTWATER_DESCRIPTION_STARTING', -# 'MAIN_FUEL_STARTING', -# 'MECHANICAL_VENTILATION_STARTING', -# 'SECONDHEAT_DESCRIPTION_STARTING', -# 'ENERGY_TARIFF_STARTING', -# 'SOLAR_WATER_HEATING_FLAG_STARTING', -# 'PHOTO_SUPPLY_STARTING', -# 'WINDOWS_DESCRIPTION_STARTING', -# 'GLAZED_TYPE_STARTING', -# 'MULTI_GLAZE_PROPORTION_STARTING', -# 'LOW_ENERGY_LIGHTING_STARTING', -# 'NUMBER_OPEN_FIREPLACES_STARTING', -# 'MAINHEATCONT_DESCRIPTION_STARTING', -# 'EXTENSION_COUNT_STARTING', -# 'TOTAL_FLOOR_AREA_STARTING', -# 'FLOOR_HEIGHT_STARTING', -# 'DAYS_TO_STARTING', -# 'WALLS_DESCRIPTION_STARTING', -# 'FLOOR_DESCRIPTION_STARTING'] -# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS', -# 'NUMBER_HEATED_ROOMS', -# 'FIXED_LIGHTING_OUTLETS_COUNT', -# 'CONSTRUCTION_AGE_BAND', -# 'TRANSACTION_TYPE_ENDING', -# 'LIGHTING_DESCRIPTION_ENDING', -# 'MAINHEAT_DESCRIPTION_ENDING', -# 'HOTWATER_DESCRIPTION_ENDING', -# 'MAIN_FUEL_ENDING', -# 'MECHANICAL_VENTILATION_ENDING', -# 'SECONDHEAT_DESCRIPTION_ENDING', -# 'ENERGY_TARIFF_ENDING', -# 'SOLAR_WATER_HEATING_FLAG_ENDING', -# 'PHOTO_SUPPLY_ENDING', -# 'WINDOWS_DESCRIPTION_ENDING', 
-# 'GLAZED_TYPE_ENDING', -# 'MULTI_GLAZE_PROPORTION_ENDING', -# 'LOW_ENERGY_LIGHTING_ENDING', -# 'NUMBER_OPEN_FIREPLACES_ENDING', -# 'MAINHEATCONT_DESCRIPTION_ENDING', -# 'EXTENSION_COUNT_ENDING', -# 'TOTAL_FLOOR_AREA_ENDING', -# 'FLOOR_HEIGHT_ENDING', -# 'DAYS_TO_ENDING', -# 'WALLS_DESCRIPTION_ENDING', -# 'FLOOR_DESCRIPTION_ENDING'] - retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml deleted file mode 100644 index 7ed9819..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataclient_type: local -metrics_type: Regression -metrics_output_filepath: ./metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml deleted file mode 100644 index 404c33f..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml +++ /dev/null @@ -1,5 +0,0 @@ -input_dataclient_type: local -output_dataclient_type: local -test_data_filepath: ./data/prepared_data/test.parquet -predictions_output_filepath: ./data/predictions/predictions.parquet -predictions_column_name: predictions diff --git a/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml deleted file mode 100644 index de18ba8..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/model_analysis.yaml +++ /dev/null @@ -1,8 +0,0 @@ -dataclient_type: local -feature_importance_filepath: ./analysis/feature_importance.parquet -permutation_subsample_amount: 1000 -loss_fns: "mean_absolute_percentage_error" -feature_importance_column: importance -n_repeats: 5 -figwidth: 7 -figheight: 6 diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 903da7d..b85d3a4 100644 --- 
a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -11,6 +11,7 @@ def clip_predictions_to_minimum_value( series_name = predictions.name predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) + # We expect every prediction to be at least a one-point improvement replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value diff --git a/modules/ml-pipeline/src/pipeline/configs/prediction_analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/prediction_analysis.yaml deleted file mode 100644 index 52fd5fc..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/prediction_analysis.yaml +++ /dev/null @@ -1,4 +0,0 @@ -dataclient_type: local -nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower -n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower -row_index: [0, 10, 20] # index of an example datapoint diff --git a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml deleted file mode 100644 index bbcf3f8..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml +++ /dev/null @@ -1,9 +0,0 @@ -input_dataclient_type: aws-s3 -output_dataclient_type: local -# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet -data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet -train_proportion: 0.9 -output_train_filepath: ./data/prepared_data/train.parquet -output_test_filepath: ./data/prepared_data/test.parquet - -# cache_o diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml new file mode 100644 index
0000000..588dd9a --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -0,0 +1,49 @@ +default: + startup_cleanup: + artefacts: ./data + metrics: ./metrics + + client: + aws-s3: + AWS_ACCESS_KEY_ID: null # Use local credentials + AWS_SECRET_ACCESS_KEY: null # Use local credentials + ENDPOINT_URL: null # Use local credentials + aws-s3-mock: + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + ENDPOINT_URL: http://localhost:9000 + local: + null + + prepare_data: + input_dataclient_type: aws-s3 + output_dataclient_type: local + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet + train_proportion: 0.9 + output_train_filepath: ./data/prepared_data/train.parquet + output_test_filepath: ./data/prepared_data/test.parquet + + feature_processor: + feature_processor_type: dataframe + feature_processor_config: + subsample_amount: null + subsample_seed: 0 + target: SAP_ENDING + drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + retain_features: null + + generate_predictions: + input_dataclient_type: local + output_dataclient_type: local + test_data_filepath: ./data/prepared_data/test.parquet + predictions_output_filepath: ./data/predictions/predictions.parquet + predictions_column_name: predictions + + generate_metrics: + dataclient_type: local + metrics_type: Regression + metrics_output_filepath: ./metrics/metrics.json + +dev: + generate_predictions: + input_dataclient_type: aws-s3 diff --git a/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml b/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml deleted file mode 100644 index 909fb4b..0000000 --- a/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml +++ /dev/null @@ -1,2 +0,0 @@ -artefacts: ./data -metrics: ./metrics diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index d296bd2..aeb705d 100644 --- 
a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,122 +5,139 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 2648d7d407dca857a1d20a11a88d3d98 - size: 5116 + md5: c9f030df733e318b80d1fa91b7732f79 + size: 5132 params: - configs/prepare_data.yaml: - output_test_filepath: ./data/prepared_data/test.parquet - output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.9 + configs/settings.yaml: + default.feature_processor.feature_processor_config.drop_columns: + - UPRN + - HEAT_DEMAND_CHANGE + - CARBON_CHANGE + - RDSAP_CHANGE + - HEAT_DEMAND_ENDING + - CARBON_ENDING + default.feature_processor.feature_processor_config.retain_features: + default.feature_processor.feature_processor_config.subsample_amount: + default.feature_processor.feature_processor_config.subsample_seed: 0 + default.feature_processor.feature_processor_config.target: SAP_ENDING + default.feature_processor.feature_processor_type: dataframe + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet + default.prepare_data.input_dataclient_type: aws-s3 + default.prepare_data.output_dataclient_type: local + default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet + default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet + default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: 7bcbf81a82015276e25749d1bc249a57.dir - size: 21076961 + md5: ed19a11a85d6a2006631173f51569d27.dir + size: 21131576 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 3eb1a5110df6e25a23d8e8a92bb27823 - size: 5257 + md5: 039578b629d7cd204016e92cd079ea90 + size: 5181 - path: data/prepared_data hash: md5 - md5: 7bcbf81a82015276e25749d1bc249a57.dir - size: 21076961 + md5: ed19a11a85d6a2006631173f51569d27.dir + size: 21131576 nfiles: 2 params: configs/build_model.yaml: - 
AutogluonAutoML: - output_filepath: ./data/model/autogluonmodel/ - problem_type: regression - eval_metric: mean_absolute_error - time_limit: 800 - presets: medium_quality - excluded_model_types: - - KNN - SKLearnLinearRegression: - SKLearnSVMRegression: - kernel: linear - fit_metrics_filepath: ./metrics/fit_metrics.json - model_save_filepath: ./data/model/autogluonmodel/ - model_type: AutogluonAutoML + default: + build_model: + model_type: AutogluonAutoML + model_save_filepath: ./data/model/autogluonmodel/ + fit_metrics_filepath: ./metrics/fit_metrics.json + SKLearnLinearRegression: + SKLearnSVMRegression: + kernel: linear + AutogluonAutoML: + output_filepath: ./data/model/autogluonmodel/ + problem_type: regression + eval_metric: mean_absolute_error + time_limit: 75 + presets: medium_quality + excluded_model_types: + - KNN outs: - path: data/model/ hash: md5 - md5: 397c46c062b51034b6f8f3f229345de3.dir - size: 334481421 - nfiles: 18 + md5: 60e253c42cc36934098c627ef3ef4cc1.dir + size: 185134993 + nfiles: 14 - path: metrics/fit_metrics.json hash: md5 - md5: f6e7e21d4229d4a229ea0a11f3023637 - size: 184 + md5: a0c2a1c9e5da0b857d510fa1ba6282a8 + size: 186 generate_predictions: cmd: python 3_generate_predictions.py deps: - - path: data/model - hash: md5 - md5: 397c46c062b51034b6f8f3f229345de3.dir - size: 334481421 - nfiles: 18 - - path: data/prepared_data - hash: md5 - md5: 7bcbf81a82015276e25749d1bc249a57.dir - size: 21076961 - nfiles: 2 - path: 3_generate_predictions.py hash: md5 - md5: 874da2443ef0d92731e4c127f3ce4acb - size: 4434 + md5: 238b3fa9f3c6f3720e77c116857070ae + size: 4720 + - path: data/model + hash: md5 + md5: 60e253c42cc36934098c627ef3ef4cc1.dir + size: 185134993 + nfiles: 14 + - path: data/prepared_data + hash: md5 + md5: ed19a11a85d6a2006631173f51569d27.dir + size: 21131576 + nfiles: 2 params: - configs/generate_predictions.yaml: - input_dataclient_type: local - output_dataclient_type: local - predictions_column_name: predictions - 
predictions_output_filepath: ./data/predictions/predictions.parquet - test_data_filepath: ./data/prepared_data/test.parquet + configs/settings.yaml: + default.generate_predictions.input_dataclient_type: local + default.generate_predictions.output_dataclient_type: local + default.generate_predictions.predictions_column_name: predictions + default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet + default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet outs: - path: data/predictions/ hash: md5 - md5: 9c18005e722f0e428f4b83c3f974f206.dir - size: 381870 + md5: 700c8767de3a86c4c5339daf3cc17192.dir + size: 380962 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py deps: + - path: 4_generate_metrics.py + hash: md5 + md5: 2c9fb78955a8c19cff0a098976f81d1b + size: 4487 - path: data/predictions hash: md5 - md5: 9c18005e722f0e428f4b83c3f974f206.dir - size: 381870 + md5: 700c8767de3a86c4c5339daf3cc17192.dir + size: 380962 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 7bcbf81a82015276e25749d1bc249a57.dir - size: 21076961 + md5: ed19a11a85d6a2006631173f51569d27.dir + size: 21131576 nfiles: 2 - - path: 4_generate_metrics.py - hash: md5 - md5: 8ce0b6b55e1688fca816985e0cf37f28 - size: 4220 params: - configs/generate_metrics.yaml: - dataclient_type: local - metrics_output_filepath: ./metrics/metrics.json - metrics_type: Regression + configs/settings.yaml: + default.generate_metrics.dataclient_type: local + default.generate_metrics.metrics_output_filepath: ./metrics/metrics.json + default.generate_metrics.metrics_type: Regression outs: - path: metrics/metrics.json hash: md5 - md5: 93d9b69d6cd951ae2c14b29ba92a2a38 - size: 186 + md5: 45ffac8f6e7283df4b69af8a9abc45e1 + size: 184 startup_cleanup: cmd: python 0_startup_cleanup.py deps: - path: 0_startup_cleanup.py hash: md5 - md5: 2e51fbcac960d0f960bf32a8ec7486a0 - size: 1748 + md5: fbb7e3b1b98b517c870f3e1df3e7f695 + size: 1676 params: - 
configs/startup_cleanup.yaml: - artefacts: ./data - metrics: ./metrics + configs/settings.yaml: + default.startup_cleanup.artefacts: ./data + default.startup_cleanup.metrics: ./metrics diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index e2969f6..ccdd779 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -4,19 +4,28 @@ stages: deps: - 0_startup_cleanup.py params: - - configs/startup_cleanup.yaml: - - artefacts - - metrics + - configs/settings.yaml: + - default.startup_cleanup.artefacts + - default.startup_cleanup.metrics always_changed: true prepare_data: cmd: python 1_prepare_data.py deps: - 1_prepare_data.py params: - - configs/prepare_data.yaml: - - output_test_filepath - - output_train_filepath - - train_proportion + - configs/settings.yaml: + - default.prepare_data.input_dataclient_type + - default.prepare_data.output_dataclient_type + - default.prepare_data.data_filepath + - default.prepare_data.train_proportion + - default.prepare_data.output_train_filepath + - default.prepare_data.output_test_filepath + - default.feature_processor.feature_processor_type + - default.feature_processor.feature_processor_config.subsample_amount + - default.feature_processor.feature_processor_config.subsample_seed + - default.feature_processor.feature_processor_config.target + - default.feature_processor.feature_processor_config.drop_columns + - default.feature_processor.feature_processor_config.retain_features outs: - data/prepared_data/ always_changed: true @@ -38,7 +47,12 @@ stages: - data/prepared_data - data/model params: - - configs/generate_predictions.yaml: + - configs/settings.yaml: + - default.generate_predictions.input_dataclient_type + - default.generate_predictions.output_dataclient_type + - default.generate_predictions.test_data_filepath + - default.generate_predictions.predictions_output_filepath + - default.generate_predictions.predictions_column_name outs: 
- data/predictions/ always_changed: true @@ -49,7 +63,10 @@ stages: - data/prepared_data - data/predictions params: - - configs/generate_metrics.yaml: + - configs/settings.yaml: + - default.generate_metrics.dataclient_type + - default.generate_metrics.metrics_type + - default.generate_metrics.metrics_output_filepath outs: - metrics/metrics.json always_changed: true diff --git a/modules/ml-pipeline/src/pipeline/model_analysis.py b/modules/ml-pipeline/src/pipeline/model_analysis.py index fb1f23c..3a77729 100644 --- a/modules/ml-pipeline/src/pipeline/model_analysis.py +++ b/modules/ml-pipeline/src/pipeline/model_analysis.py @@ -3,8 +3,6 @@ Post Model generation step: We want to look at feature analysis of the model """ -import yaml -from pathlib import Path from core.interface.InterfaceModels import MLModel from core.interface.InterfaceDataClient import DataClient from core.Logger import logger @@ -13,27 +11,16 @@ from core.DataClient import dataclient_factory from alibi.explainers import PermutationImportance, plot_permutation_importance import numpy as np import pandas as pd +from config import settings -client_path = Path(__file__).parent / "configs" / "client.yaml" -client_params = yaml.safe_load(open(client_path)) +client_params = settings.client +prepare_data_params = settings.prepare_data +feature_process_params = settings.feature_processor +build_model_params = settings.build_model +generate_predictions_params = settings.generate_predictions -prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" -prepare_data_params = yaml.safe_load(open(prepare_data_path)) - -feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" -feature_process_params = yaml.safe_load(open(feature_process_path)) - -build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" -build_model_params = yaml.safe_load(open(build_model_path)) - -model_analysis_path = Path(__file__).parent / "configs" / "model_analysis.yaml" 
-model_analysis_params = yaml.safe_load(open(model_analysis_path)) - -generate_predictions_path = ( - Path(__file__).parent / "configs" / "generate_predictions.yaml" -) -generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) +model_analysis_params = settings.model_analysis model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"]) diff --git a/modules/ml-pipeline/src/pipeline/prediction_analysis.py b/modules/ml-pipeline/src/pipeline/prediction_analysis.py index 8947916..9555693 100644 --- a/modules/ml-pipeline/src/pipeline/prediction_analysis.py +++ b/modules/ml-pipeline/src/pipeline/prediction_analysis.py @@ -12,40 +12,21 @@ import shap shap.initjs() - -import yaml from typing import List -from pathlib import Path from core.interface.InterfaceModels import MLModel from core.interface.InterfaceDataClient import DataClient from core.Logger import logger from core.MLModels import model_factory from core.DataClient import dataclient_factory -import numpy as np import pandas as pd +from config import settings - -client_path = Path(__file__).parent / "configs" / "client.yaml" -client_params = yaml.safe_load(open(client_path)) - -prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" -prepare_data_params = yaml.safe_load(open(prepare_data_path)) - -feature_process_path = Path(__file__).parent / "configs" / "feature_processor.yaml" -feature_process_params = yaml.safe_load(open(feature_process_path)) - -build_model_path = Path(__file__).parent / "configs" / "build_model.yaml" -build_model_params = yaml.safe_load(open(build_model_path)) - -generate_predictions_path = ( - Path(__file__).parent / "configs" / "generate_predictions.yaml" -) -generate_predictions_params = yaml.safe_load(open(generate_predictions_path)) - -prediction_analysis_path = ( - Path(__file__).parent / "configs" / "prediction_analysis.yaml" -) -prediction_analysis_params = 
yaml.safe_load(open(prediction_analysis_path)) +client_params = settings.client +prepare_data_params = settings.prepare_data +feature_process_params = settings.feature_processor +build_model_params = settings.build_model +generate_predictions_params = settings.generate_predictions +prediction_analysis_params = settings.prediction_analysis model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"])