From 767997c38f2123704d7a19188b98242923e283fe Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 8 Jan 2026 22:59:09 +0000 Subject: [PATCH] test simple model --- .../pipeline/configs/post_prediction_logic.py | 3 +- .../src/pipeline/configs/settings.yaml | 147 +++++++++++++-- modules/ml-pipeline/src/pipeline/dvc.lock | 170 ++++++++++++++---- 3 files changed, 278 insertions(+), 42 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 643231a..ce3b508 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -1,6 +1,7 @@ """ After predictions, we may want to apply some post processing to the predictions """ + import pandas as pd @@ -30,6 +31,6 @@ def clip_predictions_to_minimum_value( post_prediction_logic = { - "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value, + # "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value, # "round_predictions": round_predictions } diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index a6b493e..b0181b9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -12,8 +12,7 @@ default: AWS_ACCESS_KEY_ID: minio AWS_SECRET_ACCESS_KEY: minio123 ENDPOINT_URL: http://localhost:9000 - local: - null + local: null prepare_data: input_dataclient_type: aws-s3 @@ -33,15 +32,143 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: sap_ending + target: sap_starting identifier_columns: ["uprn"] # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] - drop_columns: [ - "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending", - 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', - 'number_habitable_rooms', 'number_heated_rooms', 'lighting_cost_starting', - 'lighting_cost_ending', 'heating_cost_starting', 'heating_cost_ending', 'hot_water_cost_starting', 'hot_water_cost_ending', - 'floor_thermal_transmittance', 'floor_thermal_transmittance_ending', 'lodgement_date_starting', 'lodgement_date_ending',] + drop_columns: + [ + "sap_ending", + "potential_energy_efficiency", + "environment_impact_potential", + "energy_consumption_potential", + "co2_emissions_potential", + "heat_demand_change", + "carbon_change", + "rdsap_change", + "heat_demand_ending", + "carbon_ending", + "days_to_starting", + "days_to_ending", + "number_habitable_rooms_starting", + "number_habitable_rooms_ending", + "number_heated_rooms_starting", + "number_heated_rooms_ending", + "number_habitable_rooms", + "number_heated_rooms", + "lighting_cost_starting", + "lighting_cost_ending", + "heating_cost_starting", + "heating_cost_ending", + "hot_water_cost_starting", + "hot_water_cost_ending", + "floor_thermal_transmittance", + "floor_thermal_transmittance_ending", + "lodgement_date_starting", + "lodgement_date_ending", + "walls_thermal_transmittance_ending", + "walls_thermal_transmittance_unit_ending", + "is_filled_cavity_ending", + "is_as_built_ending", + "walls_is_assumed_ending", + "is_park_home_ending", + "walls_insulation_thickness_ending", + "external_insulation_ending", + "internal_insulation_ending", + "floor_insulation_thickness_ending", + "roof_thermal_transmittance_ending", + "is_at_rafters_ending", + "roof_insulation_thickness_ending", + "heater_type_ending", + "system_type_ending", + "thermostat_characteristics_ending", + "heating_scope_ending", + "energy_recovery_ending", + "hotwater_tariff_type_ending", + "extra_features_ending", + "chp_systems_ending", + "distribution_system_ending", + "no_system_present_ending", + "appliance_ending", + "has_radiators_ending", + "has_fan_coil_units_ending", + "has_pipes_in_screed_above_insulation_ending", + "has_pipes_in_insulated_timber_floor_ending", + "has_pipes_in_concrete_slab_ending", + "has_boiler_ending", + "has_air_source_heat_pump_ending", + "has_room_heaters_ending", + "has_electric_storage_heaters_ending", + "has_warm_air_ending", + "has_electric_underfloor_heating_ending", + "has_electric_ceiling_heating_ending", + "has_community_scheme_ending", + "has_ground_source_heat_pump_ending", + "has_no_system_present_ending", + "has_portable_electric_heaters_ending", + "has_water_source_heat_pump_ending", + "has_electric_heat_pump_ending", + "has_micro-cogeneration_ending", + "has_solar_assisted_heat_pump_ending", + "has_exhaust_source_heat_pump_ending", + "has_community_heat_pump_ending", + "has_hot-water-only_ending", + "has_electric_ending", + "has_mains_gas_ending", + "has_wood_logs_ending", + "has_coal_ending", + "has_oil_ending", + "has_wood_pellets_ending", + "has_anthracite_ending", + "has_dual_fuel_mineral_and_wood_ending", + "has_smokeless_fuel_ending", + "has_lpg_ending", + "has_b30k_ending", + "has_mineral_and_wood_ending", + "has_dual_fuel_appliance_ending", + "has_electricaire_ending", + "has_assumed_for_most_rooms_ending", + "has_underfloor_heating_ending", + "thermostatic_control_ending", + "charging_system_ending", + "switch_system_ending", + "no_control_ending", + "dhw_control_ending", + "community_heating_ending", + "multiple_room_thermostats_ending", + "auxiliary_systems_ending", + "trvs_ending", + "rate_control_ending", + "glazing_type_ending", + "fuel_type_ending", + "main-fuel_tariff_type_ending", + "is_community_ending", + "no_individual_heating_or_community_network_ending", + "complex_fuel_type_ending", + "mechanical_ventilation_ending", + "secondheat_description_ending", + "glazed_type_ending", + "multi_glaze_proportion_ending", + "low_energy_lighting_ending", + "number_open_fireplaces_ending", + "solar_water_heating_flag_ending", + "photo_supply_ending", + "transaction_type_ending", + "energy_tariff_ending", + "extension_count_ending", + "total_floor_area_ending", + "floor_height_ending", + "hot_water_energy_eff_ending", + "floor_energy_eff_ending", + "windows_energy_eff_ending", + "walls_energy_eff_ending", + "sheating_energy_eff_ending", + "roof_energy_eff_ending", + "mainheat_energy_eff_ending", + "mainheatc_energy_eff_ending", + "lighting_energy_eff_ending", + "is_post_sap10_ending", + "estimated_perimeter_ending", + ] retain_features: null # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', @@ -82,4 +209,4 @@ default: dev: generate_predictions: - input_dataclient_type: aws-s3 + input_dataclient_type: aws-s3 diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index eff64a2..bfb1df7 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -21,6 +21,11 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: + - sap_ending + - potential_energy_efficiency + - environment_impact_potential + - energy_consumption_potential + - co2_emissions_potential - heat_demand_change - carbon_change - rdsap_change @@ -44,25 +49,128 @@ stages: - floor_thermal_transmittance_ending - lodgement_date_starting - lodgement_date_ending + - walls_thermal_transmittance_ending + - walls_thermal_transmittance_unit_ending + - is_filled_cavity_ending + - is_as_built_ending + - walls_is_assumed_ending + - is_park_home_ending + - walls_insulation_thickness_ending + - external_insulation_ending + - internal_insulation_ending + - floor_insulation_thickness_ending + - roof_thermal_transmittance_ending + - is_at_rafters_ending + - roof_insulation_thickness_ending + - heater_type_ending + - system_type_ending + - thermostat_characteristics_ending + - heating_scope_ending + - energy_recovery_ending + - hotwater_tariff_type_ending + - extra_features_ending + - chp_systems_ending + - distribution_system_ending + - no_system_present_ending + - appliance_ending + - has_radiators_ending + - has_fan_coil_units_ending + - has_pipes_in_screed_above_insulation_ending + - has_pipes_in_insulated_timber_floor_ending + - has_pipes_in_concrete_slab_ending + - has_boiler_ending + - has_air_source_heat_pump_ending + - has_room_heaters_ending + - has_electric_storage_heaters_ending + - has_warm_air_ending + - has_electric_underfloor_heating_ending + - has_electric_ceiling_heating_ending + - has_community_scheme_ending + - has_ground_source_heat_pump_ending + - has_no_system_present_ending + - has_portable_electric_heaters_ending + - has_water_source_heat_pump_ending + - has_electric_heat_pump_ending + - has_micro-cogeneration_ending + - has_solar_assisted_heat_pump_ending + - has_exhaust_source_heat_pump_ending + - has_community_heat_pump_ending + - has_hot-water-only_ending + - has_electric_ending + - has_mains_gas_ending + - has_wood_logs_ending + - has_coal_ending + - has_oil_ending + - has_wood_pellets_ending + - has_anthracite_ending + - has_dual_fuel_mineral_and_wood_ending + - has_smokeless_fuel_ending + - has_lpg_ending + - has_b30k_ending + - has_mineral_and_wood_ending + - has_dual_fuel_appliance_ending + - has_electricaire_ending + - has_assumed_for_most_rooms_ending + - has_underfloor_heating_ending + - thermostatic_control_ending + - charging_system_ending + - switch_system_ending + - no_control_ending + - dhw_control_ending + - community_heating_ending + - multiple_room_thermostats_ending + - auxiliary_systems_ending + - trvs_ending + - rate_control_ending + - glazing_type_ending + - fuel_type_ending + - main-fuel_tariff_type_ending + - is_community_ending + - no_individual_heating_or_community_network_ending + - complex_fuel_type_ending + - mechanical_ventilation_ending + - secondheat_description_ending + - glazed_type_ending + - multi_glaze_proportion_ending + - low_energy_lighting_ending + - number_open_fireplaces_ending + - solar_water_heating_flag_ending + - photo_supply_ending + - transaction_type_ending + - energy_tariff_ending + - extension_count_ending + - total_floor_area_ending + - floor_height_ending + - hot_water_energy_eff_ending + - floor_energy_eff_ending + - windows_energy_eff_ending + - walls_energy_eff_ending + - sheating_energy_eff_ending + - roof_energy_eff_ending + - mainheat_energy_eff_ending + - mainheatc_energy_eff_ending + - lighting_energy_eff_ending + - is_post_sap10_ending + - estimated_perimeter_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: sap_ending + default.feature_processor.feature_processor_config.target: sap_starting default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/2025-11-02-09-32-42/dataset_rooms.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local - default.prepare_data.output_test_filepath: + default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet - default.prepare_data.output_train_filepath: + default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet default.prepare_data.train_proportion: 0.9 outs: - path: data/prepared_data/ hash: md5 - md5: d798b73fafe6d59c96c0216baeaf085a.dir - size: 46090520 + md5: dd0f94a24ee81f56ee81ec2e2cd16930.dir + size: 28011379 nfiles: 3 build_model: cmd: python 2_build_model.py @@ -73,8 +181,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: d798b73fafe6d59c96c0216baeaf085a.dir - size: 46090520 + md5: dd0f94a24ee81f56ee81ec2e2cd16930.dir + size: 28011379 nfiles: 3 params: configs/build_model.yaml: @@ -175,18 +283,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 2d3627b9752e0eb6988d655cc76cb871.dir - size: 3474407 + md5: 32ca8d6ae9247cc1f14d331d42e9a906.dir + size: 3457647 nfiles: 1 - path: data/model/ hash: md5 - md5: e4279fd1aff989b128e7477ad7e02d81.dir - size: 790249675 + md5: de9d3888df7a1c5afffe54625569d4e0.dir + size: 660397572 nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: e45c166e089965e9c17d9b4a6656d6d6 - size: 225 + md5: 38875a7edc13ace91ae9478bee9e070f + size: 227 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -196,28 +304,28 @@ stages: size: 2464 - path: data/model hash: md5 - md5: e4279fd1aff989b128e7477ad7e02d81.dir - size: 790249675 + md5: de9d3888df7a1c5afffe54625569d4e0.dir + size: 660397572 nfiles: 31 - path: data/prepared_data hash: md5 - md5: d798b73fafe6d59c96c0216baeaf085a.dir - size: 46090520 + md5: dd0f94a24ee81f56ee81ec2e2cd16930.dir + size: 28011379 nfiles: 3 params: configs/settings.yaml: default.generate_predictions.input_dataclient_type: local default.generate_predictions.output_dataclient_type: local default.generate_predictions.predictions_column_name: predictions - default.generate_predictions.predictions_output_filepath: + default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet - default.generate_predictions.test_data_filepath: + default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet outs: - path: data/predictions/ hash: md5 - md5: fdebcc5ba775c2b416e33e8775dd450a.dir - size: 484710 + md5: 822173b06528b9520a47d84c16b39eb3.dir + size: 480674 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -228,13 +336,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: fdebcc5ba775c2b416e33e8775dd450a.dir - size: 484710 + md5: 822173b06528b9520a47d84c16b39eb3.dir + size: 480674 nfiles: 1 - path: data/prepared_data hash: md5 - md5: d798b73fafe6d59c96c0216baeaf085a.dir - size: 46090520 + md5: dd0f94a24ee81f56ee81ec2e2cd16930.dir + size: 28011379 nfiles: 3 params: configs/settings.yaml: @@ -244,8 +352,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: a5f8e795d87356eaff446ae7006a47fe - size: 224 + md5: c4d89ad0cc8b17cdff7ef52bc5260a3c + size: 226 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: @@ -265,9 +373,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 86c9a8f2520cac8ed0796d62c03de278 - size: 356 + md5: 35bbf37093438857a59d63ccb3611549 + size: 370 - path: metrics/scenario_table.md hash: md5 - md5: 686f3f5d966c82c0f68baaaa74617aa1 + md5: 424014b8b8bd057489e86abe68e0d335 size: 872