diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml index d97cf22..a1307c1 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml @@ -1,5 +1,5 @@ -model_type: SKLearnLinearRegression -model_save_filepath: ./data/model/model.joblib +model_type: AutogluonAutoML +model_save_filepath: ./data/model/autogluonmodel/ SKLearnLinearRegression: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml index 18e6f84..9aa02f0 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml @@ -3,6 +3,6 @@ feature_processor_config: subsample_amount: null subsample_seed: 0 target: RDSAP_CHANGE - drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"] - retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] - # retain_features: null + drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"] + # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "TOTAL_FLOOR_AREA_ENDING"] + retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index af8a802..273e78d 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -1,7 +1,7 @@ input_dataclient_type: aws-s3 output_dataclient_type: local datahandler_type: parquet -data_filepath: s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet +data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index b5d7e23..f04423a 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -15,8 +15,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3948 - path: data/prepared_data hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 params: configs/build_model.yaml: @@ -42,26 +42,26 @@ stages: SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear - model_save_filepath: ./data/model/model.joblib - model_type: SKLearnLinearRegression + model_save_filepath: ./data/model/autogluonmodel/ + model_type: AutogluonAutoML outs: - path: data/model/ hash: md5 - md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir - size: 904 - nfiles: 1 + md5: 154f823d56a9892948a633789d9b08a5.dir + size: 680552724 + nfiles: 18 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir - size: 904 - nfiles: 1 + md5: 154f823d56a9892948a633789d9b08a5.dir + size: 680552724 + nfiles: 18 - path: data/prepared_data hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -77,21 +77,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: ea0431b600f0ef357de3a543482cefe7.dir - size: 4085105 + md5: d8abefde18d78588158ef6acf282e2ed.dir + size: 2948553 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: ea0431b600f0ef357de3a543482cefe7.dir - size: 4085105 + md5: d8abefde18d78588158ef6acf282e2ed.dir + size: 2948553 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -107,8 +107,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: ae53c4781cb8a754d24e29ba7ddb16ea - size: 183 + md5: f5aaae75ea74241500cd1ce76751c579 + size: 182 startup_cleanup: cmd: python startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/src/dvc.yaml index 42e8947..afaaa71 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/src/dvc.yaml @@ -19,6 +19,7 @@ stages: - train_proportion outs: - data/prepared_data/ + always_changed: true build_model: cmd: python build_model.py deps: @@ -28,6 +29,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + always_changed: true generate_predictions: cmd: python generate_predictions.py deps: @@ -38,6 +40,7 @@ stages: - configs/generate_predictions.yaml: outs: - data/predictions/ + always_changed: true generate_metrics: cmd: python generate_metrics.py deps: @@ -48,5 +51,6 @@ stages: - configs/generate_metrics.yaml: outs: - metrics/metrics.json + always_changed: true metrics: - metrics/metrics.json