From c0dc934be6a61e39c6aae6956c396208eb4c66d1 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 27 Mar 2024 23:10:36 +0000 Subject: [PATCH] run carbon model with new data --- MODEL_REGISTRY.md | 4 +- modules/ml-pipeline/src/README.md | 2 +- modules/ml-pipeline/src/pipeline/.gitignore | 1 + .../src/pipeline/configs/build_model.yaml | 1 + .../configs/feature_processor_logic.py | 54 +++++++---- .../pipeline/configs/post_prediction_logic.py | 15 +-- .../src/pipeline/configs/settings.yaml | 15 ++- .../ml-pipeline/src/pipeline/core/MLModels.py | 4 +- modules/ml-pipeline/src/pipeline/dvc.lock | 91 +++++++++++-------- .../predictions/requirements-dev.txt | 4 +- .../requirements/predictions/requirements.txt | 4 +- .../training/requirements-dev.txt | 7 +- .../requirements/training/requirements.txt | 4 +- 13 files changed, 119 insertions(+), 87 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 1bcceec..820b75a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,9 +8,9 @@ "active": true }, "sap": { - "version": "v0.5.0", + "version": "v0.4.0", "stage": { - "dev": "v0.5.0" + "dev": "v0.4.0" }, "registered": true, "active": true diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md index d7afc6a..db1b8b4 100644 --- a/modules/ml-pipeline/src/README.md +++ b/modules/ml-pipeline/src/README.md @@ -1,3 +1,3 @@ -# The generic reproducible ML-pipeline +# The generic reproducible ML-pipeline! Pipeline required to build a model to produce an output, that gets hashed via DVC diff --git a/modules/ml-pipeline/src/pipeline/.gitignore b/modules/ml-pipeline/src/pipeline/.gitignore index bf035d2..ce8309f 100644 --- a/modules/ml-pipeline/src/pipeline/.gitignore +++ b/modules/ml-pipeline/src/pipeline/.gitignore @@ -1,3 +1,4 @@ # Ignore dynaconf secret files .secrets.* +example.py diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fcec7f7..be5ec5c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -19,3 +19,4 @@ default: excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: {'num_folds_parallel': 2} diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 103168d..bcc53e5 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -18,30 +18,44 @@ def remove_starting_columns(df): return df -def remove_floor_height_ending(df): - # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] - # shows bottom 0.5 percentile is 1.665 - # So keep anything above this - df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True) - print("we in here") +def keep_negative_heat_change(df): + df = df[df["heat_demand_change"] < 0] return df -def remove_minimum_habitable_room_size(df): - # Need minimum of 6.5m per habitable room - df = df[ - df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5 - ].reset_index(drop=True) +def keep_non_negative_carbon_ending(df): + df = df[df["carbon_ending"] > 0] return df -def keep_flats(df): - df = df[df["property_type"] == "Flat"] +def keep_negative_carbon_change(df): + df = df[df["carbon_change"] < 0] return df -def keep_non_zero_rdsap(df): - df = df[df["rdsap_change"] != 0] +# TODO: Move to ETL pipeline +def remove_unreasonable_habitable_rooms(df): + """ + Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2 + """ + minimum_room_size_index = ( + df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5 + ) + df = df[minimum_room_size_index] + return df + + +def remove_top_1_percent_heat_demand(df): + # threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%'] + threshold_value = 860 + df = df[df["heat_demand_starting"] < threshold_value] + return df + + +def remove_top_1_percent_carbon(df): + # threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%'] + threshold_value = 18 + df = df[df["carbon_starting"] < threshold_value] return df @@ -54,10 +68,12 @@ def keep_non_zero_rdsap(df): # return df business_logic = { - # "keep_non_zero_rdsap": keep_non_zero_rdsap, - # "keep_flats": keep_flats, - # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, - # "remove_floor_height_ending": remove_floor_height_ending + "remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms, + "keep_negative_heat_change": keep_negative_heat_change, + "keep_negative_carbon_change": keep_negative_carbon_change, + "remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, + "remove_top_1_percent_carbon": remove_top_1_percent_carbon, + "keep_non_negative_carbon_ending": keep_non_negative_carbon_ending, # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 643231a..2ca8890 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -1,23 +1,24 @@ """ After predictions, we may want to apply some post processing to the predictions """ + import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 + data: pd.DataFrame, + predictions: pd.Series, ) -> pd.Series: series_name = predictions.name predictions.name = "predictions" + predictions = predictions.astype(data["carbon_starting"].dtype) predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = ( - predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] - ) - predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "sap_starting"] + minimum_value - ) + replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"] + predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[ + replace_index, "carbon_starting" + ] predictions_new = predictions_df["predictions"] predictions_new.name = series_name diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4327e64..fecdcb0 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,12 +18,7 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -33,9 +28,13 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: sap_ending + target: carbon_ending identifier_columns: ["uprn"] - drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] + # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"] + drop_columns: [ + "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending", "days_to_starting", "days_to_ending", + 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', + 'number_habitable_rooms', 'number_heated_rooms'] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4fc572a..257261d 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel: models = { "SKLearnLinearRegression": SKLearnLinearRegression(), "SKLearnSVMRegression": SKLearnSVMRegression(), - "AutogluonAutoML": AutogluonAutoML() + "AutogluonAutoML": AutogluonAutoML(), # ADD OTHER MODELS HERE } @@ -151,6 +151,7 @@ class AutogluonAutoML: "excluded_model_types", "infer_limit", "infer_limit_batch_size", + "ag_args_ensemble", ] def load_model(self, path: Union[Path, str]) -> None: @@ -207,6 +208,7 @@ class AutogluonAutoML: excluded_model_types=model_hyperparameters["excluded_model_types"], infer_limit=model_hyperparameters["infer_limit"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], + ag_args_ensemble=model_hyperparameters["ag_args_ensemble"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index f15978f..81224d8 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -1,12 +1,23 @@ schema: '2.0' stages: + startup_cleanup: + cmd: python 0_startup_cleanup.py + deps: + - path: 0_startup_cleanup.py + hash: md5 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 + params: + configs/settings.yaml: + default.startup_cleanup.artefacts: ./data + default.startup_cleanup.metrics: ./metrics prepare_data: cmd: python 1_prepare_data.py deps: - path: 1_prepare_data.py hash: md5 - md5: 1793a35e71751d3c84f9affc67ecb9a8 - size: 4296 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -14,13 +25,22 @@ stages: - carbon_change - rdsap_change - heat_demand_ending - - carbon_ending + - sap_ending + - days_to_starting + - days_to_ending + - number_habitable_rooms_starting + - number_habitable_rooms_ending + - number_heated_rooms_starting + - number_heated_rooms_ending + - number_habitable_rooms + - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: sap_ending + default.feature_processor.feature_processor_config.target: carbon_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + default.prepare_data.data_filepath: + s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +49,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +61,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 params: configs/build_model.yaml: @@ -70,21 +90,23 @@ stages: - XT infer_limit: 0.05 infer_limit_batch_size: 10000 + ag_args_ensemble: + num_folds_parallel: 2 outs: - path: data/fit_predictions/ hash: md5 - md5: ede187e9d0bffdef054f573f3c2bd222.dir - size: 3578590 + md5: 19d033f5abfa9b064c3e52815e607ced.dir + size: 3927492 nfiles: 1 - path: data/model/ hash: md5 - md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir - size: 814720415 - nfiles: 31 + md5: f159d40353b01ffdcf1b1b490c019f1f.dir + size: 787748148 + nfiles: 32 - path: metrics/fit_metrics.json hash: md5 - md5: c45b84f12971a0156e4f3d85d3e725f5 - size: 218 + md5: e69d56ab9d82f23f2aa66001bd9bebbc + size: 229 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -94,13 +116,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir - size: 814720415 - nfiles: 31 + md5: f159d40353b01ffdcf1b1b490c019f1f.dir + size: 787748148 + nfiles: 32 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 params: configs/settings.yaml: @@ -112,8 +134,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 5e60ca251af51de6fef3d0c659f8bb27.dir - size: 627416 + md5: 50d0c76fc56c6290babeff1c84750316.dir + size: 651956 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -124,13 +146,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 5e60ca251af51de6fef3d0c659f8bb27.dir - size: 627416 + md5: 50d0c76fc56c6290babeff1c84750316.dir + size: 651956 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 84fa631bd02686b052d6a7144eafd38e.dir - size: 43859225 + md5: 35d7daa8144434e188ba3b1da4bcf328.dir + size: 33946500 nfiles: 2 params: configs/settings.yaml: @@ -140,16 +162,5 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 033efa4d4044b6b6fc92dd37194727fa - size: 225 - startup_cleanup: - cmd: python 0_startup_cleanup.py - deps: - - path: 0_startup_cleanup.py - hash: md5 - md5: b1b12f6b6393fbf8b83d23684df0a3d4 - size: 1220 - params: - configs/settings.yaml: - default.startup_cleanup.artefacts: ./data - default.startup_cleanup.metrics: ./metrics + md5: 542b982d6aa9fe0fdb89611e4299cb1e + size: 228 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt index 0d259fb..258981d 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt index afad9be..2ab48e9 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt @@ -1,7 +1,7 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0 pyarrow==13.0.0 PyYAML==6.0.1 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt index d8c5907..2024d84 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt @@ -1,9 +1,10 @@ joblib==1.3.2 boto3==1.28.17 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 +ray==2.6.3 dynaconf==3.2.0 -alibi==0.9.4 +alibi==0.9.5 shap==0.42.1 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt index bbdc2fa..84452a3 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt @@ -1,4 +1,4 @@ boto3==1.28.41 -pandas==1.5.3 -autogluon==0.8.2 +pandas==2.1.4 +autogluon==1.0.0 dynaconf==3.2.0