diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml index a1307c1..d97cf22 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml @@ -1,5 +1,5 @@ -model_type: AutogluonAutoML -model_save_filepath: ./data/model/autogluonmodel/ +model_type: SKLearnLinearRegression +model_save_filepath: ./data/model/model.joblib SKLearnLinearRegression: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/client.yaml b/modules/ml-pipeline/src/pipeline/src/configs/client.yaml new file mode 100644 index 0000000..65dc7a2 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/src/configs/client.yaml @@ -0,0 +1,10 @@ +aws-s3: + AWS_ACCESS_KEY_ID: null + AWS_SECRET_ACCESS_KEY: null + ENDPOINT_URL: null +aws-s3-mock: + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + ENDPOINT_URL: http://localhost:9000 +local: + null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml index 233a329..18e6f84 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml @@ -4,5 +4,5 @@ feature_processor_config: subsample_seed: 0 target: RDSAP_CHANGE drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"] - # retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] - retain_features: null + retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] + # retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index 736f5d2..af8a802 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -1,14 +1,8 @@ input_dataclient_type: aws-s3 -input_dataclient: - AWS_ACCESS_KEY_ID: null - AWS_SECRET_ACCESS_KEY: null - ENDPOINT_URL: null output_dataclient_type: local -output_dataclient: - null datahandler_type: parquet data_filepath: s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet -train_proportion: 0.8 +train_proportion: 0.1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index 01a400f..b5d7e23 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -5,18 +5,18 @@ stages: deps: - path: prepare_data.py hash: md5 - md5: 87a83e62512bff93c89f3e93c1ed248d - size: 5593 + md5: 9c31bfb1b75ea3c9685ec459cbb50e62 + size: 5921 params: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.8 + train_proportion: 0.1 outs: - path: data/prepared_data/ hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3948 - path: data/prepared_data hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 params: configs/build_model.yaml: @@ -42,31 +42,31 @@ stages: SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear - model_save_filepath: ./data/model/autogluonmodel/ - model_type: AutogluonAutoML + model_save_filepath: ./data/model/model.joblib + model_type: SKLearnLinearRegression outs: - path: data/model/ hash: md5 - md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir - size: 1264795580 - nfiles: 28 + md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir + size: 904 + nfiles: 1 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir - size: 1264795580 - nfiles: 28 + md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir + size: 904 + nfiles: 1 - path: data/prepared_data hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 - path: generate_predictions.py hash: md5 - md5: 76c45e7575ec979e6c4c8e2cf754a720 - size: 4225 + md5: 32c0ecd082e1f8fc4426338d6629979c + size: 4686 params: configs/generate_predictions.yaml: input_dataclient_type: local @@ -77,26 +77,26 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir - size: 672577 + md5: ea0431b600f0ef357de3a543482cefe7.dir + size: 4085105 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir - size: 672577 + md5: ea0431b600f0ef357de3a543482cefe7.dir + size: 4085105 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 - path: generate_metrics.py hash: md5 - md5: cc368845f62523575a9ed5c791e27815 - size: 4329 + md5: 4709c42d93f8e717a3d9e4958e46cd76 + size: 4587 params: configs/generate_metrics.yaml: dataclient_type: local @@ -107,15 +107,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3f03e50a419af6730351a5016e2ae98a - size: 182 + md5: ae53c4781cb8a754d24e29ba7ddb16ea + size: 183 startup_cleanup: cmd: python startup_cleanup.py deps: - path: startup_cleanup.py hash: md5 - md5: f7fe2ca33004b34530da0a3ab48c1790 - size: 1458 + md5: 2e51fbcac960d0f960bf32a8ec7486a0 + size: 1748 params: configs/startup_cleanup.yaml: artefacts: ./data diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/src/dvc.yaml index 7e98535..42e8947 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/src/dvc.yaml @@ -7,6 +7,7 @@ stages: - configs/startup_cleanup.yaml: - artefacts - metrics + always_changed: true prepare_data: cmd: python prepare_data.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py index a7def45..3a5c668 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py @@ -21,6 +21,9 @@ from core.Logger import logger RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" prepare_data_params = yaml.safe_load(open(prepare_data_path)) @@ -96,7 +99,12 @@ if __name__ == "__main__": logger.info("----------------------------") model = model_factory(build_model_params["model_type"]) - dataclient = dataclient_factory(generate_metrics_params["dataclient_type"]) + + dataclient_type = generate_metrics_params["dataclient_type"] + dataclient = dataclient_factory(dataclient_type) + dataclient.ingest_configurations(client_params[dataclient_type]) + dataclient.establish_client() + input_datahandler = datahandler_factory( generate_metrics_params["input_datahandler_type"] ) diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py index 552db47..48e192b 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py @@ -19,6 +19,9 @@ from core.Logger import logger RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" prepare_data_params = yaml.safe_load(open(prepare_data_path)) @@ -97,12 +100,19 @@ if __name__ == "__main__": # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. for metric runs, this will be a local data client # For predictions, we will want a cloud data client - input_dataclient = dataclient_factory( - generate_predictions_params["input_dataclient_type"] - ) - output_dataclient = dataclient_factory( - generate_predictions_params["output_dataclient_type"] + + input_dataclient_type = generate_predictions_params["input_dataclient_type"] + input_dataclient = dataclient_factory(input_dataclient_type) + input_dataclient.ingest_configurations(config=client_params[input_dataclient_type]) + input_dataclient.establish_client() + + output_dataclient_type = generate_predictions_params["output_dataclient_type"] + output_dataclient = dataclient_factory(output_dataclient_type) + output_dataclient.ingest_configurations( + config=client_params[output_dataclient_type] ) + output_dataclient.establish_client() + datahandler = datahandler_factory(prepare_data_params["datahandler_type"]) generate_predictions( diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/src/prepare_data.py index 6df07fb..400adbf 100644 --- a/modules/ml-pipeline/src/pipeline/src/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/src/prepare_data.py @@ -21,6 +21,9 @@ from core.FeatureProcessor import feature_processor_factory RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" prepare_data_params = yaml.safe_load(open(prepare_data_path)) @@ -94,7 +97,7 @@ def prepare_data( dataclient=output_dataclient, obj=train, location=output_train_filepath ) - if test: + if test is not None: datahandler.save_data( dataclient=output_dataclient, obj=test, location=output_test_filepath ) @@ -112,18 +115,17 @@ if __name__ == "__main__": logger.info(f"--- Initiate DataClient ---") logger.info("----------------------------") - input_dataclient = dataclient_factory(prepare_data_params["input_dataclient_type"]) - output_dataclient = dataclient_factory( - prepare_data_params["output_dataclient_type"] - ) + input_dataclient_type = prepare_data_params["input_dataclient_type"] + output_dataclient_type = prepare_data_params["output_dataclient_type"] - input_dataclient.ingest_configurations( - config=prepare_data_params["input_dataclient"] - ) + input_dataclient = dataclient_factory(input_dataclient_type) + output_dataclient = dataclient_factory(output_dataclient_type) + + input_dataclient.ingest_configurations(config=client_params[input_dataclient_type]) input_dataclient.establish_client() output_dataclient.ingest_configurations( - config=prepare_data_params["output_dataclient"] + config=client_params[output_dataclient_type] ) output_dataclient.establish_client()