diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index ed2e4a3..d29d7ac 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -192,6 +192,7 @@ def remove_high_ratio_of_area_to_rooms(df): # Remove top 0.05% of area-to-heated-rooms df = df[df['area-to-heated-rooms'] < df['area-to-heated-rooms'].quantile(0.9995)].reset_index(drop=True) + df = df.drop(columns=['area-to-heated-rooms']) return df def add_estimate_annual_kwh(df): diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 28b5f2c..b484049 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -37,7 +37,7 @@ default: target: heating_kwh identifier_columns: ["uprn"] drop_columns: ["hot_water_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date', - "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1',] + "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1', 'mainheat-env-eff', 'environment-impact-current', 'hot-water-env-eff', 'floor-env-eff'] retain_features: null generate_predictions: diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index e1f65f5..536a3bc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -35,6 +35,10 @@ stages: - uprn-source - postcode - address1 + - mainheat-env-eff + - environment-impact-current + - hot-water-env-eff + - floor-env-eff default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 @@ -50,8 +54,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: 382d5d02772d4ead3a31fa9420c03417.dir + size: 29570807 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -62,8 +66,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: 382d5d02772d4ead3a31fa9420c03417.dir + size: 29570807 nfiles: 2 params: configs/build_model.yaml: @@ -95,18 +99,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 9a2abeada227b8bb4c13d6c745bef581.dir - size: 1547064 + md5: 3ea5e827470bb96408fa70bc45ed6b58.dir + size: 1545844 nfiles: 1 - path: data/model/ hash: md5 - md5: 43b72f9284e92842cbc82bc7cc0950e2.dir - size: 506201607 - nfiles: 36 + md5: cff0bc3b08dfb48fff929bb4991ea2f5.dir + size: 291336375 + nfiles: 35 - path: metrics/fit_metrics.json hash: md5 - md5: 4a496483bffad3efe671f29110729e48 - size: 221 + md5: 63b660c30b855ee0d86b0c1be4ad537e + size: 220 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -116,13 +120,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 43b72f9284e92842cbc82bc7cc0950e2.dir - size: 506201607 - nfiles: 36 + md5: cff0bc3b08dfb48fff929bb4991ea2f5.dir + size: 291336375 + nfiles: 35 - path: data/prepared_data hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: 382d5d02772d4ead3a31fa9420c03417.dir + size: 29570807 nfiles: 2 params: configs/settings.yaml: @@ -134,7 +138,7 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 88832d623c3e437eaec221307ac33aae.dir + md5: 88c5ad48fd6035600135a4541f2811a8.dir size: 163584 nfiles: 1 generate_metrics: @@ -146,13 +150,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 88832d623c3e437eaec221307ac33aae.dir + md5: 88c5ad48fd6035600135a4541f2811a8.dir size: 163584 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f506f1f059945c0f014c3f505a63726c.dir - size: 30388447 + md5: 382d5d02772d4ead3a31fa9420c03417.dir + size: 29570807 nfiles: 2 params: configs/settings.yaml: @@ -162,8 +166,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: f2783bdec0f0974b6d799609c6189467 - size: 222 + md5: f3fd84bd242e9f806aaedbb560f2ac7e + size: 219 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: