diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py index 9d2fa68..43d0a0c 100644 --- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -79,6 +79,17 @@ def generate_scenario_predictions( scenario_data = pd.DataFrame() # Can have multiple scenario data files + if scenario_data_filepaths is None: + logger.info("No scenario data filepaths provided") + output_dataclient.save_data( + obj=scenario_data, location=comparison_output_filepath, save_config=None + ) + + output_dataclient.save_data( + obj=scenario_data, location=metrics_output_filepath, save_config=None + ) + return + for scenario_data_filepath in scenario_data_filepaths: scenario_data = pd.concat( [ diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 643231a..ce3b508 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -1,6 +1,7 @@ """ After predictions, we may want to apply some post processing to the predictions """ + import pandas as pd @@ -30,6 +31,6 @@ def clip_predictions_to_minimum_value( post_prediction_logic = { - "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value, + # "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value, # "round_predictions": round_predictions } diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index 2df0cb6..a6a957a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -5,6 +5,6 @@ default: scenario_data_filepaths: # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index f42b2be..d23d514 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,7 +18,7 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-31-11-53-30/dataset_no_cleaning_records.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -32,9 +32,8 @@ default: identifier_columns: ["uprn"] # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] drop_columns: [ - "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending", - 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', - 'number_habitable_rooms', 'number_heated_rooms'] + 'extension_count', 'constituency', 'co2_emissions_current', 'co2_emissions_potential', 'energy_consumption_potential', 'environment_impact_potential', 'potential_energy_efficiency', 'energy_consumption_current', 'number_habitable_rooms', 'number_heated_rooms' + ] retain_features: null # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 104dc83..ef819fe 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -21,66 +21,23 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - heat_demand_change - - carbon_change - - rdsap_change - - heat_demand_ending - - carbon_ending - - days_to_starting - - days_to_ending - - number_habitable_rooms_starting - - number_habitable_rooms_ending - - number_heated_rooms_starting - - number_heated_rooms_ending + - extension_count + - constituency + - co2_emissions_current + - co2_emissions_potential + - energy_consumption_potential + - environment_impact_potential + - potential_energy_efficiency + - energy_consumption_current - number_habitable_rooms - number_heated_rooms default.feature_processor.feature_processor_config.retain_features: - - uprn - - sap_starting - - hot_water_energy_eff_ending - - mainheat_energy_eff_ending - - constituency - - roof_energy_eff_ending - - walls_energy_eff_ending - - secondheat_description_ending - - property_type - - mainheatc_energy_eff_ending - - built_form - - walls_insulation_thickness_ending - - potential_energy_efficiency - - transaction_type_ending - - floor_thermal_transmittance_ending - - low_energy_lighting_ending - - heat_demand_starting - - photo_supply_ending - - carbon_starting - - walls_thermal_transmittance_ending - - roof_insulation_thickness_ending - - total_floor_area_ending - - number_open_fireplaces_ending - - windows_energy_eff_ending - - floor_height_ending - - extension_count_ending - - has_air_source_heat_pump_ending - - charging_system_ending - - construction_age_band - - glazed_type_ending - - roof_thermal_transmittance_ending - - floor_insulation_thickness_ending - - has_mains_gas_ending - - estimated_perimeter_starting - - energy_consumption_potential - - environment_impact_potential - - heater_type_ending - - multi_glaze_proportion_ending - - lighting_energy_eff_ending - - fixed_lighting_outlets_count default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet + s3://retrofit-data-dev/sap_change_model/2024-03-31-11-53-30/dataset_no_cleaning_records.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -89,8 +46,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 707102eff568f50acce74ee035505701.dir + size: 3105316 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -101,8 +58,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 707102eff568f50acce74ee035505701.dir + size: 3105316 nfiles: 2 params: configs/build_model.yaml: @@ -134,18 +91,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: de46250d454c4d713ab580b10ff3fd31.dir - size: 3349318 + md5: affa689b5330ba5f4e947ef1015fef71.dir + size: 454068 nfiles: 1 - path: data/model/ hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 - nfiles: 35 + md5: 90d021a62015b48b47fbc4220851f26d.dir + size: 387078305 + nfiles: 37 - path: metrics/fit_metrics.json hash: md5 - md5: 8a952a5e884c268e6059357a627b9251 - size: 224 + md5: fcd16837efd83c4611b1d0d384536e91 + size: 226 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -155,13 +112,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir - size: 735951861 - nfiles: 35 + md5: 90d021a62015b48b47fbc4220851f26d.dir + size: 387078305 + nfiles: 37 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 707102eff568f50acce74ee035505701.dir + size: 3105316 nfiles: 2 params: configs/settings.yaml: @@ -173,8 +130,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: bf9fc2e6ef8ae6428c134e7c1c4e5ac8.dir + size: 50978 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -185,13 +142,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir - size: 463563 + md5: bf9fc2e6ef8ae6428c134e7c1c4e5ac8.dir + size: 50978 nfiles: 1 - path: data/prepared_data hash: md5 - md5: efa416abea618ae6220a0c3d597603cf.dir - size: 44750997 + md5: 707102eff568f50acce74ee035505701.dir + size: 3105316 nfiles: 2 params: configs/settings.yaml: @@ -201,30 +158,29 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 9f863f47799d42c101eba3b03a179455 + md5: 25cef9cd80330e3b1060cb4cbdd1c945 size: 224 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: - path: 5_generate_scenarios.py hash: md5 - md5: a18f6c6ae2082f038df47386cf3e418e - size: 4896 + md5: 45f4dc81c380db433ddf56405d7683bd + size: 5273 params: configs/scenarios.yaml: default.scenarios: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: - - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 64e7db945ff655ae03c20c9845f19106 - size: 363 + md5: d41d8cd98f00b204e9800998ecf8427e + size: 0 - path: metrics/scenario_table.md hash: md5 - md5: d4f8afe07b774374aeaa48f1b7b8a5fc - size: 2133 + md5: d41d8cd98f00b204e9800998ecf8427e + size: 0