diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 48375c3..6e34d36 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -2,7 +2,7 @@ name: Sap Change Model Deploy on: push: - branches: [ sap-dev, sap-prod ] + branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod] jobs: deploy: diff --git a/modules/ml-pipeline/.dvc/.gitignore b/modules/ml-pipeline/.dvc/.gitignore deleted file mode 100644 index 528f30c..0000000 --- a/modules/ml-pipeline/.dvc/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/config.local -/tmp -/cache diff --git a/modules/ml-pipeline/.dvc/config b/modules/ml-pipeline/.dvc/config deleted file mode 100644 index 03ccfbc..0000000 --- a/modules/ml-pipeline/.dvc/config +++ /dev/null @@ -1,2 +0,0 @@ -['remote "myremote"'] - url = /tmp/dvcstore diff --git a/modules/ml-pipeline/.dvcignore b/modules/ml-pipeline/.dvcignore deleted file mode 100644 index 5197305..0000000 --- a/modules/ml-pipeline/.dvcignore +++ /dev/null @@ -1,3 +0,0 @@ -# Add patterns of files dvc should ignore, which could improve -# the performance. Learn more at -# https://dvc.org/doc/user-guide/dvcignore diff --git a/modules/ml-pipeline/.gto b/modules/ml-pipeline/.gto deleted file mode 100644 index c44c86e..0000000 --- a/modules/ml-pipeline/.gto +++ /dev/null @@ -1,2 +0,0 @@ -# .gto config file -stages: [dev, stage, prod] # list of allowed Stages diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index f3504a7..7ca4951 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -67,7 +67,6 @@ def build_model( test_data: Union[pd.DataFrame, None] = None, pipeline_mode: bool = False, ): - logger.info("--- Loading Data for build process ---") if train_data is None: diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 1ebb62d..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 4943f6b..026191c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -9,11 +9,11 @@ Business Logic dict + functions def remove_starting_columns(df): keep_column_index = [ - False if col_name.endswith("_STARTING") else True + False if col_name.endswith("_starting") else True for col_name in list(df.columns) ] keep_columns = df.columns[keep_column_index].to_list() - keep_columns.append("SAP_STARTING") + keep_columns.append("sap_starting") df = df[keep_columns] return df @@ -22,7 +22,7 @@ def remove_floor_height_ending(df): # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] # shows bottom 0.5 percentile is 1.665 # So keep anything above this - df = df[df["FLOOR_HEIGHT_ENDING"] > 1.665].reset_index(drop=True) + df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True) print("we in here") return df @@ -30,13 +30,13 @@ def remove_floor_height_ending(df): def remove_minimum_habitable_room_size(df): # Need minimum of 6.5m per habitable room df = df[ - df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] > 6.5 + df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5 ].reset_index(drop=True) return df def keep_flats(df): - df = df[df["PROPERTY_TYPE"] == "Flat"] + df = df[df["property_type"] == "Flat"] return df diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index b85d3a4..c1b8ebd 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -12,9 +12,9 @@ def clip_predictions_to_minimum_value( predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] + replace_index = predictions_df["sap_starting"] + 1 > predictions_df["predictions"] predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value + predictions_df.loc[replace_index, "sap_starting"] + minimum_value ) predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 9333c46..918abd6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -31,9 +32,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: SAP_ENDING - identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + target: sap_ending + identifier_columns: ["uprn"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 20dd532..82c8608 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -10,17 +10,17 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - HEAT_DEMAND_CHANGE - - CARBON_CHANGE - - RDSAP_CHANGE - - HEAT_DEMAND_ENDING - - CARBON_ENDING + - heat_demand_change + - carbon_change + - rdsap_change + - heat_demand_ending + - carbon_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: SAP_ENDING + default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_refactor.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,20 +29,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 7b79f280b8b0d5bc6f07669e7cc37c6a - size: 4150 + md5: b824822475c222521516493e68eef9c5 + size: 4149 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: f2999107de7572ea5ff0f2d774fa83b8.dir - size: 424943352 - nfiles: 27 + md5: 6265dafedf579905c31c676e81c2a9c7.dir + size: 344212462 + nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 9537e7ebc2eb32b421a7cabd2005f00b - size: 223 + md5: 5cd6b92af1b1df753e20e9ea33629c4d + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: f2999107de7572ea5ff0f2d774fa83b8.dir - size: 424943352 - nfiles: 27 + md5: 6265dafedf579905c31c676e81c2a9c7.dir + size: 344212462 + nfiles: 24 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: f4439a56669f84bc51a9fcb4cd08353f.dir - size: 346539 + md5: b130faf5117b06897b2deed97f5868ee.dir + size: 367038 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: f4439a56669f84bc51a9fcb4cd08353f.dir - size: 346539 + md5: b130faf5117b06897b2deed97f5868ee.dir + size: 367038 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 6bfdb621b608648c017bf2323f7b5052.dir - size: 37048968 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/settings.yaml: @@ -130,7 +130,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 357904cf106279be5a578e8faefa5d80 + md5: 3900cc1697d6d7308728b3d5b3025f85 size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 6c29308..e1d33a6 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -190,28 +190,35 @@ prediction_analysis_params = settings.prediction_analysis model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"]) dataclient_type = prediction_analysis_params["dataclient_type"] -dataclient = dataclient_factory( - dataclient_type=dataclient_type, - dataclient_config=client_params[dataclient_type], -) +# dataclient_type = 'aws-s3' +# dataclient = dataclient_factory( +# dataclient_type=dataclient_type, +# dataclient_config=client_params[dataclient_type], +# ) +# data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet") target = feature_process_params["feature_processor_config"]["target"] predictions_column_name = generate_predictions_params["predictions_column_name"] output_test_filepath = prepare_data_params["output_test_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] -test_df = dataclient.load_data(output_test_filepath) -predictions = dataclient.load_data(predictions_output_filepath) +# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet") + + +local_dataclient = dataclient_factory( + dataclient_type="local", + dataclient_config=client_params["local"], +) +test_df = local_dataclient.load_data(output_test_filepath) +predictions = local_dataclient.load_data(predictions_output_filepath) mix_df = pd.concat([test_df.copy(), predictions], axis=1) mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) -cosine_similarity_df = mix_df[ - mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"]) -] +cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])] from sklearn.metrics.pairwise import cosine_similarity -row_index = 20695 +row_index = 0 from sklearn.preprocessing import LabelEncoder @@ -224,8 +231,18 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[ feature_vector = cosine_similarity_df.loc[[row_index]] cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) - -similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5) -similar_index = similar_df.index +similar_index = ( + cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index +) check_df = mix_df.loc[similar_index] + +columns_to_check = [ + "LOW_ENERGY_LIGHTING_ENDING", + "walls_thermal_transmittance_ENDING", + "floor_thermal_transmittance_ENDING", + "roof_thermal_transmittance_ENDING", + "roof_insulation_thickness_ENDING", +] + +cosine_similarity_df = mix_df[columns_to_check] diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index 91cb005..a2b9531 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ -dvc==3.18.0 -dvc-s3==2.23.0 -gto==1.0.4 -pyOpenSSL==23.2.0 +dvc==3.36.0 +dvc-s3==3.0.1 +gto==1.6.1 +pyOpenSSL==23.3.0