From f9b0b6112c351aca229baa9173ec401a431704be Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 9 Oct 2023 15:44:37 +0000 Subject: [PATCH 1/4] add some processing code --- .../src/pipeline/configs/analysis.yaml | 2 +- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 25 +++++++++++++++++++ modules/ml-pipeline/src/pipeline/eda.py | 10 ++++---- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml index 5c6e749..725660b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/analysis.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/analysis.yaml @@ -13,4 +13,4 @@ default: dataclient_type: local nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower - row_index: [0, 10, 20] # index of an example datapoint + row_index: [20695, 50243, 7653] # index of an example datapoint diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index d296e6a..bd684e9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 180 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index c32d2fe..4943f6b 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -18,6 +18,28 @@ 
def remove_starting_columns(df): return df +def remove_floor_height_ending(df): + # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] + # shows bottom 0.5 percentile is 1.665 + # So keep anything above this + df = df[df["FLOOR_HEIGHT_ENDING"] > 1.665].reset_index(drop=True) + print("we in here") + return df + + +def remove_minimum_habitable_room_size(df): + # Need minimum of 6.5m per habitable room + df = df[ + df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] > 6.5 + ].reset_index(drop=True) + return df + + +def keep_flats(df): + df = df[df["PROPERTY_TYPE"] == "Flat"] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -27,6 +49,9 @@ def remove_starting_columns(df): # return df business_logic = { + # "keep_flats": keep_flats, + # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, + # "remove_floor_height_ending": remove_floor_height_ending # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 2fdd8be..6c29308 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -207,11 +207,11 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) cosine_similarity_df = mix_df[ - mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"]) + mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"]) ] from sklearn.metrics.pairwise import cosine_similarity -row_index = 58199 +row_index = 20695 from sklearn.preprocessing import LabelEncoder @@ -224,8 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[ feature_vector = cosine_similarity_df.loc[[row_index]] 
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) -similar_index = ( - cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index -) + +similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5) +similar_index = similar_df.index check_df = mix_df.loc[similar_index] From 7589977cda6530f2c6837b6a68e9035122449ca2 Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Thu, 12 Oct 2023 10:19:22 +0100 Subject: [PATCH 2/4] Update Makefile --- modules/ml-pipeline/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 5c5d563..6ccb4c4 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -10,15 +10,15 @@ init: dev-conda dev-conda: # conda deactivate || echo "Not in conda environment" # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously" - conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y + conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -vvvv -n $CONDA_ENV pip install --upgrade pip - conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -vvvv -n $CONDA_ENV pre-commit install - conda run -vvvv -n $CONDA_ENV pip install ipykernel + conda run -vvvv -n ${CONDA_ENV} pip install --upgrade pip + conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -vvvv -n ${CONDA_ENV} pre-commit install + conda run -vvvv -n ${CONDA_ENV} pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" - echo "conda activate 
$CONDA_ENV" + echo "conda activate ${CONDA_ENV}" .PHONY: dev-pyenv From 96153f82489c4aa09008262589840b98c770fccf Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 17 Oct 2023 03:08:01 +0100 Subject: [PATCH 3/4] Update Makefile --- modules/ml-pipeline/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 6ccb4c4..0bef7d6 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -9,14 +9,14 @@ init: dev-conda .PHONY: dev-conda dev-conda: # conda deactivate || echo "Not in conda environment" - # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously" + # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -vvvv -n ${CONDA_ENV} pip install --upgrade pip - conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -vvvv -n ${CONDA_ENV} pre-commit install - conda run -vvvv -n ${CONDA_ENV} pip install ipykernel + conda run -v -n ${CONDA_ENV} pip install --upgrade pip + conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -v -n ${CONDA_ENV} pre-commit install + conda run -v -n ${CONDA_ENV} pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "conda activate ${CONDA_ENV}" From 790c3a9456eec4b05e831a99ed80a6963aeba0c9 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 18 Oct 2023 13:27:25 +0000 Subject: [PATCH 4/4] use test dataset --- .../src/pipeline/configs/build_model.yaml | 2 +- 
.../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 42 +++++++++---------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index bd684e9..d296e6a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 180 + time_limit: 4000 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index ce7ed2c..9333c46 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c499874..16eb857 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: SAP_ENDING default.feature_processor.feature_processor_type: 
dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 5359 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 params: configs/build_model.yaml: @@ -66,13 +66,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 7a5527f779efcb1a7db068148b6bcc45.dir + size: 422448184 nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 2bb16ac67de8778fbc08171d562b34d5 - size: 184 + md5: 77790bb9485c04c77125e361921c3774 + size: 225 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 7a5527f779efcb1a7db068148b6bcc45.dir + size: 422448184 nfiles: 27 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir + size: 346687 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions 
hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir + size: 346687 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: cd75be9fecff0c647792dd2db648085c.dir + size: 37056053 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 2e13ae67759a64261d03224f1c0d4bf4 - size: 185 + md5: 7afd04d656dc83ad6aa942d9c63f5b4e + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: