From 3227399d2e618a61663a54a939ba987b9f9dd804 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sat, 13 Jul 2024 09:34:23 +0100 Subject: [PATCH] add initial hotwater kwh model --- .github/workflows/Deploy.yml | 2 +- .github/workflows/MLPipelinePostMerge.yml | 2 +- .github/workflows/MLPipelinePullRequest.yml | 2 +- .../configs/feature_processor_logic.py | 119 ++++++++++++++++++ .../src/pipeline/configs/settings.yaml | 12 +- modules/ml-pipeline/src/pipeline/dvc.lock | 65 ++++------ 6 files changed, 152 insertions(+), 50 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index c1b21f6..7540a2c 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -2,7 +2,7 @@ name: Sap Change Model Deploy on: push: - branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod, lighting-dev, lighting-prod ] + branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod, hotwaterkwh-dev, hotwaterkwh-prod] jobs: deploy: diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 78ea05a..9941cce 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -13,7 +13,7 @@ on: - "sap-dev" - "heat-dev" - "carbon-dev" - - "lighting-dev" + - "hotwaterkwh-dev" permissions: write-all diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index aa701db..d06df5f 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -5,7 +5,7 @@ on: # branches: # - "model-**" pull_request: - branches: ["sap-dev", "heat-dev", "carbon-dev", "lighting-dev"] + branches: ["sap-dev", "heat-dev", "carbon-dev", "hotwaterkwh-dev"] label: types: ["created", "edited"] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 103168d..48a28bf 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -5,7 +5,18 @@ During the feature processor step, we can apply additional business logic and fe """ Business Logic dict + functions """ +import pandas as pd +import numpy as np +import boto3 +import msgpack +s3 = boto3.resource('s3') + +# Get the MessagePack data from S3 +obj = s3.Object("retrofit-data-dev", "cleaned_epc_data/cleaned.bson") +cleaned = obj.get()['Body'].read() + +cleaned = msgpack.unpackb(cleaned, raw=False) def remove_starting_columns(df): keep_column_index = [ @@ -44,6 +55,112 @@ def keep_non_zero_rdsap(df): df = df[df["rdsap_change"] != 0] return df +def remove_hotwaterkwh_bottom_percentile(df, percentile=0.0001): + df = df[df["hot_water_kwh"] > df["hot_water_kwh"].quantile(percentile)] + return df + +def add_features_from_code(df): + + FEATURES = { + "heating_kwh": [ + "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", + "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms", + "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type", + "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff", + "walls-energy-eff", + "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey", + "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", + "low-energy-lighting", "environment-impact-current", "energy-tariff", + "county", "construction-age-band", "co2-emissions-current", + ], + "hot_water_kwh": [ + "lodgement-year", "lodgement-month", + "current-energy-efficiency", + "energy-consumption-current", + "hot-water-cost-current", + "total-floor-area", "number-heated-rooms", + "hotwater-description", "hot-water-energy-eff", "main-fuel", "property-type", "built-form", + "co2-emissions-current", + ] + } + CATEGORICAL_COLUMNS = [ + "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", + "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form", + "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff", + "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description", + "county", + "windows-description", "windows-energy-eff", "flat-top-storey", + "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", + "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating" + ] + + NUMERICAL_COLUMNS = list({ + x for x in FEATURES["heating_kwh"] + FEATURES["hot_water_kwh"] + if x not in CATEGORICAL_COLUMNS + }) + + + """Performs feature engineering on the dataset.""" + df["lodgement-date"] = pd.to_datetime(df["lodgement-date"]) + df["lodgement-year"] = df["lodgement-date"].dt.year + df["lodgement-month"] = df["lodgement-date"].dt.month + + # For walls, roof, floor description where we have average thermal transmittance, to avoid too many categories + # we group them + ranges = { + "lessthan 0.1": (0, 0.1), + "0.1 - 0.3": (0.1, 0.3), + "0.3 - 0.5": (0.3, 0.5), + "morethan 0.5": (0.5, 2.5), + } + + # Generate the lookup table + thermal_transmittance_lookup_table = [] + for i in range(1, 251): + value = i / 100 + for label, (low, high) in ranges.items(): + if low < value <= high: + thermal_transmittance_lookup_table.append({"from": value, "to": label}) + break + + # Convert to DataFrame for display + thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table) + thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) + + # Apply the lookup table to the data + for feature in ["walls-description", "roof-description", "floor-description"]: + cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]] + # Round to 2 decimal places and convert to string + cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) + + df = df.merge( + cleaned_df, + how="left", + left_on=feature, + right_on="original_description", + ) + # We now have the thermal transmittance in the data, which we can use to group with the lookup table + df = df.merge( + thermal_transmittance_lookup_table, + how="left", + left_on="thermal_transmittance", + right_on="from", + ) + # Where "to" is populated, replace feature with to + df[feature] = np.where( + ~pd.isnull(df["to"]), + df["to"], + df[feature] + ) + df = df.drop(columns=["original_description", "thermal_transmittance", "from", "to"]) + + # Convert data types + df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].apply(pd.to_numeric) + df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].astype(str) + + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] @@ -54,6 +171,8 @@ def keep_non_zero_rdsap(df): # return df business_logic = { + "add_features_from_code": add_features_from_code, + "remove_hotwaterkwh_bottom_percentile": remove_hotwaterkwh_bottom_percentile # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 3adbe55..571682f 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,7 +22,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet + data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -32,15 +33,10 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: lighting_cost_ending + target: hot_water_kwh identifier_columns: ["uprn"] # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"] - drop_columns: [ - "sap_ending", "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", - "heating_cost_ending", "hot_water_cost_ending", - # "days_to_starting", "days_to_ending", - 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', - 'number_habitable_rooms', 'number_heated_rooms'] + drop_columns: ["heating_kwh"] retain_features: null # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index a34651c..6062508 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -21,27 +21,14 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - sap_ending - - heat_demand_change - - carbon_change - - rdsap_change - - heat_demand_ending - - carbon_ending - - heating_cost_ending - - hot_water_cost_ending - - number_habitable_rooms_starting - - number_habitable_rooms_ending - - number_heated_rooms_starting - - number_heated_rooms_ending - - number_habitable_rooms - - number_heated_rooms + - heating_kwh default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: lighting_cost_ending + default.feature_processor.feature_processor_config.target: hot_water_kwh default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: - s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet + s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -50,8 +37,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 0f11a02cf75c0421757c0b26184cec33.dir - size: 48971227 + md5: 322c8294651dea6c4db9e06157a91ffd.dir + size: 23387145 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -62,8 +49,8 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 0f11a02cf75c0421757c0b26184cec33.dir - size: 48971227 + md5: 322c8294651dea6c4db9e06157a91ffd.dir + size: 23387145 nfiles: 2 params: configs/build_model.yaml: @@ -95,17 +82,17 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 36c41f88681ab90668c17ce63fd9c318.dir - size: 3444201 + md5: b149b2be5ed3105e73b02000b9912422.dir + size: 724848 nfiles: 1 - path: data/model/ hash: md5 - md5: bb9c3f1538e02e20e918ec36a0b7546f.dir - size: 754271944 - nfiles: 37 + md5: 3fe37e27b51fe6d9472252f219fd9126.dir + size: 465478726 + nfiles: 36 - path: metrics/fit_metrics.json hash: md5 - md5: 16ae1efa8ac48d8ed978bb3fa67be64a + md5: c27dcce525b763fa7c2c55820ae72727 size: 225 generate_predictions: cmd: python 3_generate_predictions.py @@ -116,13 +103,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: bb9c3f1538e02e20e918ec36a0b7546f.dir - size: 754271944 - nfiles: 37 + md5: 3fe37e27b51fe6d9472252f219fd9126.dir + size: 465478726 + nfiles: 36 - path: data/prepared_data hash: md5 - md5: 0f11a02cf75c0421757c0b26184cec33.dir - size: 48971227 + md5: 322c8294651dea6c4db9e06157a91ffd.dir + size: 23387145 nfiles: 2 params: configs/settings.yaml: @@ -134,8 +121,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 50909a5b19c2551410e921dc9a92bef7.dir - size: 480359 + md5: 07db4158559475e73ffb06ff95a6c869.dir + size: 77435 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -146,13 +133,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 50909a5b19c2551410e921dc9a92bef7.dir - size: 480359 + md5: 07db4158559475e73ffb06ff95a6c869.dir + size: 77435 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 0f11a02cf75c0421757c0b26184cec33.dir - size: 48971227 + md5: 322c8294651dea6c4db9e06157a91ffd.dir + size: 23387145 nfiles: 2 params: configs/settings.yaml: @@ -162,8 +149,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: d74767b34a1042c9ab0e3d6535791be6 - size: 224 + md5: db8eddb1bb0b190188e25de65bdbd8e8 + size: 220 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: