diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index cc5623d..918abd6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -34,7 +34,7 @@ default: subsample_seed: 0 target: sap_ending identifier_columns: ["uprn"] - drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_change", "carbon_ending"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 5e7bfe5..82c8608 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -13,7 +13,7 @@ stages: - heat_demand_change - carbon_change - rdsap_change - - heat_demand_change + - heat_demand_ending - carbon_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/build_model.yaml: @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir - size: 334981921 + md5: 6265dafedf579905c31c676e81c2a9c7.dir + size: 344212462 nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 89ba30b943c911e24b13b4370db12d18 - size: 225 + md5: 5cd6b92af1b1df753e20e9ea33629c4d + size: 224 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 6a737d44dae68be2e75d6edb7f04f3ca.dir - size: 334981921 + md5: 6265dafedf579905c31c676e81c2a9c7.dir + size: 344212462 nfiles: 24 - path: data/prepared_data hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir - size: 362994 + md5: b130faf5117b06897b2deed97f5868ee.dir + size: 367038 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: c9a0ad3ef06f23d5d507bbec0ba86e98.dir - size: 362994 + md5: b130faf5117b06897b2deed97f5868ee.dir + size: 367038 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 3d1f4d54c7b520531e4f5ff5f33e34d8.dir - size: 40122363 + md5: 5d29397fcafe6b3dc4d51ffaf1e55239.dir + size: 39303409 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: fa40071006901c4335b5dbd567c9d9b3 - size: 226 + md5: 3900cc1697d6d7308728b3d5b3025f85 + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 6c29308..e1d33a6 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -190,28 +190,35 @@ prediction_analysis_params = settings.prediction_analysis model = model_factory(build_model_params["model_type"]) model.load_model(build_model_params["model_save_filepath"]) dataclient_type = prediction_analysis_params["dataclient_type"] -dataclient = dataclient_factory( - dataclient_type=dataclient_type, - dataclient_config=client_params[dataclient_type], -) +# dataclient_type = 'aws-s3' +# dataclient = dataclient_factory( +# dataclient_type=dataclient_type, +# dataclient_config=client_params[dataclient_type], +# ) +# data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet") target = feature_process_params["feature_processor_config"]["target"] predictions_column_name = generate_predictions_params["predictions_column_name"] output_test_filepath = prepare_data_params["output_test_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] -test_df = dataclient.load_data(output_test_filepath) -predictions = dataclient.load_data(predictions_output_filepath) +# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet") + + +local_dataclient = dataclient_factory( + dataclient_type="local", + dataclient_config=client_params["local"], +) +test_df = local_dataclient.load_data(output_test_filepath) +predictions = local_dataclient.load_data(predictions_output_filepath) mix_df = pd.concat([test_df.copy(), predictions], axis=1) mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) -cosine_similarity_df = mix_df[ - mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"]) -] +cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])] from sklearn.metrics.pairwise import cosine_similarity -row_index = 20695 +row_index = 0 from sklearn.preprocessing import LabelEncoder @@ -224,8 +231,18 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[ feature_vector = cosine_similarity_df.loc[[row_index]] cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector) - -similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5) -similar_index = similar_df.index +similar_index = ( + cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index +) check_df = mix_df.loc[similar_index] + +columns_to_check = [ + "LOW_ENERGY_LIGHTING_ENDING", + "walls_thermal_transmittance_ENDING", + "floor_thermal_transmittance_ENDING", + "roof_thermal_transmittance_ENDING", + "roof_insulation_thickness_ENDING", +] + +cosine_similarity_df = mix_df[columns_to_check]