2026-06-08 11:17:25 +00:00
10 changed files with 86 additions and 256 deletions
--- a/.github/workflows/Deploy.yml
+++ b/.github/workflows/Deploy.yml
@@ -2,7 +2,7 @@ name: Sap Change Model Deploy

 on:
  push:
-    branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod, heatingkwh-dev, heatingkwh-prod]
+    branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod]

 jobs:
  deploy:
--- a/.github/workflows/MLPipelinePostMerge.yml
+++ b/.github/workflows/MLPipelinePostMerge.yml
@@ -13,7 +13,6 @@ on:
      - "sap-dev"
      - "heat-dev"
      - "carbon-dev"
-      - "heatingkwh-dev"

 permissions: write-all

--- a/.github/workflows/MLPipelinePullRequest.yml
+++ b/.github/workflows/MLPipelinePullRequest.yml
@@ -5,7 +5,7 @@ on:
  #   branches:
  #     - "model-**"
  pull_request:
-    branches: ["sap-dev", "heat-dev", "carbon-dev", "heatingkwh-dev"]
+    branches: ["sap-dev", "heat-dev", "carbon-dev"]
  label:
    types: ["created", "edited"]

--- a/MODEL_REGISTRY.md
+++ b/MODEL_REGISTRY.md
@@ -16,57 +16,17 @@
        "active": true
    },
    "heat": {
-        "version": "v0.6.0",
+        "version": "v0.5.0",
        "stage": {
-            "dev": "v0.6.0"
+            "dev": "v0.5.0"
        },
        "registered": true,
        "active": true
    },
    "carbon": {
-        "version": "v0.6.0",
+        "version": "v0.5.0",
        "stage": {
-            "dev": "v0.6.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "hotwater": {
-        "version": "v1.0.0",
-        "stage": {
-            "dev": "v1.0.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "heating": {
-        "version": "v1.0.0",
-        "stage": {
-            "dev": "v1.0.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "lighting": {
-        "version": "v1.0.0",
-        "stage": {
-            "dev": "v1.0.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "hotwaterkwh": {
-        "version": "v1.1.0",
-        "stage": {
-            "dev": "v1.1.0"
-        },
-        "registered": true,
-        "active": true
-    },
-    "heatingkwh": {
-        "version": "v1.2.0",
-        "stage": {
-            "dev": "v1.2.0"
+            "dev": "v0.5.0"
        },
        "registered": true,
        "active": true
--- a/modules/ml-pipeline/README.MD
+++ b/modules/ml-pipeline/README.MD
@@ -17,15 +17,14 @@ Within `src` folder, the structure is as follows:

 # How to develop using this pipeline:

-First, download miniconda to use conda to manage Python Environments
-Rund `conda init`, to initialise your terminal
-
-Change to this directory and run `make init`, which will:
- Create a conda virtual environment with this version of python - current 3.10.12
+Run `make init`, which will:
+- Download pyenv (Python version management)
+- Download Python 3.X.X as defined in the `make` file - current 3.10.12
+- Create a virtual environment with this version of python
 - Install packages in the training and version control directories in the pipeline folder (dev version if applicable)
 - Install pre-commit to enable pre-commit hooks

-To use the environment, run `conda activate dev_env_pipeline`
+To use the environment, run `source .dev_env_pipeline/bin/activate`.

 To enable the virtual envrionemnt created in vscode:
 - Open settings
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@@ -5,18 +5,6 @@ During the feature processor step, we can apply additional business logic and fe
 """
 Business Logic dict + functions
 """
-import pandas as pd
-import numpy as np
-import boto3
-import msgpack
-
-s3 = boto3.resource('s3')
-
-# Get the MessagePack data from S3
-obj = s3.Object("retrofit-data-dev", "cleaned_epc_data/cleaned.bson")
-cleaned = obj.get()['Body'].read()
-
-cleaned = msgpack.unpackb(cleaned, raw=False)


 def remove_starting_columns(df):
@@ -56,111 +44,6 @@ def keep_non_zero_rdsap(df):
    df = df[df["rdsap_change"] != 0]
    return df

-def remove_heatingkwh_bottom_percentile(df, percentile=0.0001):
-    df = df[df["heating_kwh"] > df["heating_kwh"].quantile(percentile)]
-    return df
-
-def add_features_from_code(df):
-
-    FEATURES = {
-        "heating_kwh": [
-            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-            "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms",
-            "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type",
-            "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff",
-            "walls-energy-eff",
-            "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey",
-            "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
-            "low-energy-lighting", "environment-impact-current", "energy-tariff",
-            "county", "construction-age-band", "co2-emissions-current",
-        ],
-        "hot_water_kwh": [
-            "lodgement-year", "lodgement-month",
-            "current-energy-efficiency",
-            "energy-consumption-current",
-            "hot-water-cost-current",
-            "total-floor-area", "number-heated-rooms",
-            "hotwater-description", "hot-water-energy-eff", "main-fuel", "property-type", "built-form",
-            "co2-emissions-current",
-        ]
-    }
-    CATEGORICAL_COLUMNS = [
-        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
-        "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form",
-        "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
-        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
-        "county",
-        "windows-description", "windows-energy-eff", "flat-top-storey",
-        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
-        "low-energy-lighting", "environment-impact-current", "energy-tariff", "current-energy-rating"
-    ]
-
-    NUMERICAL_COLUMNS = list({
-            x for x in FEATURES["heating_kwh"] + FEATURES["hot_water_kwh"]
-            if x not in CATEGORICAL_COLUMNS
-        })
-    
-
-    """Performs feature engineering on the dataset."""
-    df["lodgement-date"] = pd.to_datetime(df["lodgement-date"])
-    df["lodgement-year"] = df["lodgement-date"].dt.year
-    df["lodgement-month"] = df["lodgement-date"].dt.month
-
-    # For walls, roof, floor description where we have average thermal transmittance, to avoid too many categories
-    # we group them
-    ranges = {
-        "lessthan 0.1": (0, 0.1),
-        "0.1 - 0.3": (0.1, 0.3),
-        "0.3 - 0.5": (0.3, 0.5),
-        "morethan 0.5": (0.5, 2.5),
-    }
-
-    # Generate the lookup table
-    thermal_transmittance_lookup_table = []
-    for i in range(1, 251):
-        value = i / 100
-        for label, (low, high) in ranges.items():
-            if low < value <= high:
-                thermal_transmittance_lookup_table.append({"from": value, "to": label})
-                break
-
-    # Convert to DataFrame for display
-    thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
-    thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
-
-    # Apply the lookup table to the data
-    for feature in ["walls-description", "roof-description", "floor-description"]:
-        cleaned_df = pd.DataFrame(cleaned[feature])[["original_description", "thermal_transmittance"]]
-        # Round to 2 decimal places and convert to string
-        cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
-
-        df = df.merge(
-            cleaned_df,
-            how="left",
-            left_on=feature,
-            right_on="original_description",
-        )
-        # We now have the thermal transmittance in the data, which we can use to group with the lookup table
-        df = df.merge(
-            thermal_transmittance_lookup_table,
-            how="left",
-            left_on="thermal_transmittance",
-            right_on="from",
-        )
-        # Where "to" is populated, replace feature with to
-        df[feature] = np.where(
-            ~pd.isnull(df["to"]),
-            df["to"],
-            df[feature]
-        )
-        df = df.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
-
-    # Convert data types
-    df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].apply(pd.to_numeric)
-    df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].astype(str)
-
-    return df
-

 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
@@ -170,41 +53,7 @@ def add_features_from_code(df):
 #     df = df[keep_columns]
 #     return df

-def enforce_minimum_habitable_room_size(df):
-    # Need minimum of 6.5m per habitable room
-    df = df[
-        df["total-floor-area"] / df["number-habitable-rooms"].astype(float) > 6.5
-    ].reset_index(drop=True)
-    return df
-
-def round_to_100s(df):
-    df['heating_kwh'] = (df['heating_kwh']/100).round()*100
-    return df 
-
-def remove_high_ratio_of_area_to_rooms(df):
-    df['area-to-heated-rooms'] = df['total-floor-area'] / df['number-heated-rooms'].astype(float)
-
-    # Remove na rows
-    df = df[(df['area-to-heated-rooms'].notna())].reset_index(drop=True)
-
-    # change any infinite values to 0
-    df['area-to-heated-rooms'] = df['area-to-heated-rooms'].replace([np.inf], 0)
-
-    # Remove top 0.05% of area-to-heated-rooms
-    df = df[df['area-to-heated-rooms'] < df['area-to-heated-rooms'].quantile(0.9995)].reset_index(drop=True)
-    return df
-
-def add_estimate_annual_kwh(df):
-    df['estimate_annual_kwh'] = df['energy-consumption-current'] * df['total-floor-area']
-    return df
-
 business_logic = {
-    "add_features_from_code": add_features_from_code,
-    "remove_heatingkwh_bottom_percentile": remove_heatingkwh_bottom_percentile,
-    # "round_to_100s": round_to_100s,
-    "enforce_minimum_habitable_room_size": enforce_minimum_habitable_room_size,
-    "remove_high_ratio_of_area_to_rooms": remove_high_ratio_of_area_to_rooms,
-    "add_estimate_annual_kwh": add_estimate_annual_kwh,
    # "keep_non_zero_rdsap": keep_non_zero_rdsap,
    # "keep_flats": keep_flats,
    # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
--- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
@@ -30,6 +30,6 @@ def clip_predictions_to_minimum_value(


 post_prediction_logic = {
-    # "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value,
+    "clip_predictions_to_minimum_value": clip_predictions_to_minimum_value,
    # "round_predictions": round_predictions
 }
--- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml
@@ -8,6 +8,6 @@ default:
      # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
      # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
      # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
-      # - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
+      - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
    comparison_output_filepath: ./metrics/scenario_table.md
    metrics_output_filepath: ./metrics/scenario_metrics.md
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@@ -21,10 +21,7 @@ default:
    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-07-03-23-11-39/dataset_rooms.parquet
-    # data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-08/energy_consumption_dataset.parquet
-    data_filepath: s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet
+    data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
    train_proportion: 0.9
    output_train_filepath: ./data/prepared_data/train.parquet
    output_test_filepath: ./data/prepared_data/test.parquet
@@ -34,11 +31,37 @@ default:
    feature_processor_config:
      subsample_amount: null
      subsample_seed: 0
-      target: heating_kwh
+      target: sap_ending
      identifier_columns: ["uprn"]
-      drop_columns: ["hot_water_kwh", 'lodgement-datetime', 'lodgement-date', 'number-habitable-rooms', 'local-authority', 'posttown', 'address', 'inspection-date',
-      "county", "constituency-label", 'address2', 'uprn-source', 'postcode', 'address1',]
+      # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
+      drop_columns: [
+        "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
+        'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
+        'number_habitable_rooms', 'number_heated_rooms']
      retain_features: null
+      # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
+      #  'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
+      #  'walls_energy_eff_ending', 'secondheat_description_ending',
+      #  'property_type', 'mainheatc_energy_eff_ending', 'built_form',
+      #  'walls_insulation_thickness_ending', 'potential_energy_efficiency',
+      #  'transaction_type_ending',
+      #  'floor_thermal_transmittance_ending',
+      #  'low_energy_lighting_ending', 'heat_demand_starting',
+      #  'photo_supply_ending', 'carbon_starting',
+      #  'walls_thermal_transmittance_ending',
+      #  'roof_insulation_thickness_ending',
+      #  'total_floor_area_ending', 'number_open_fireplaces_ending',
+      #  'windows_energy_eff_ending',
+      #  'floor_height_ending',
+      #  'extension_count_ending',
+      #  'has_air_source_heat_pump_ending',
+      #  'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
+      #  'roof_thermal_transmittance_ending',
+      #  'floor_insulation_thickness_ending', 'has_mains_gas_ending',
+      #  'estimated_perimeter_starting', 'energy_consumption_potential',
+      #  'environment_impact_potential', 'heater_type_ending',
+      #  'multi_glaze_proportion_ending',
+      #  'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']

  generate_predictions:
    input_dataclient_type: local
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@@ -21,27 +21,26 @@ stages:
    params:
      configs/settings.yaml:
        default.feature_processor.feature_processor_config.drop_columns:
-        - hot_water_kwh
-        - lodgement-datetime
-        - lodgement-date
-        - number-habitable-rooms
-        - local-authority
-        - posttown
-        - address
-        - inspection-date
-        - county
-        - constituency-label
-        - address2
-        - uprn-source
-        - postcode
-        - address1
+        - heat_demand_change
+        - carbon_change
+        - rdsap_change
+        - heat_demand_ending
+        - carbon_ending
+        - days_to_starting
+        - days_to_ending
+        - number_habitable_rooms_starting
+        - number_habitable_rooms_ending
+        - number_heated_rooms_starting
+        - number_heated_rooms_ending
+        - number_habitable_rooms
+        - number_heated_rooms
        default.feature_processor.feature_processor_config.retain_features:
        default.feature_processor.feature_processor_config.subsample_amount:
        default.feature_processor.feature_processor_config.subsample_seed: 0
-        default.feature_processor.feature_processor_config.target: heating_kwh
+        default.feature_processor.feature_processor_config.target: sap_ending
        default.feature_processor.feature_processor_type: dataframe
-        default.prepare_data.data_filepath: 
-          s3://retrofit-data-dev/energy_consumption/2024-07-25/energy_consumption_dataset.parquet
+        default.prepare_data.data_filepath:
+          s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
        default.prepare_data.input_dataclient_type: aws-s3
        default.prepare_data.output_dataclient_type: local
        default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@@ -50,8 +49,8 @@ stages:
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: f506f1f059945c0f014c3f505a63726c.dir
-      size: 30388447
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
+      size: 45056059
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
@@ -62,8 +61,8 @@ stages:
      size: 4820
    - path: data/prepared_data
      hash: md5
-      md5: f506f1f059945c0f014c3f505a63726c.dir
-      size: 30388447
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
+      size: 45056059
      nfiles: 2
    params:
      configs/build_model.yaml:
@@ -95,18 +94,18 @@ stages:
    outs:
    - path: data/fit_predictions/
      hash: md5
-      md5: 9a2abeada227b8bb4c13d6c745bef581.dir
-      size: 1547064
+      md5: d9c9afc05e8780db47c0548b19bf7d19.dir
+      size: 3349989
      nfiles: 1
    - path: data/model/
      hash: md5
-      md5: 43b72f9284e92842cbc82bc7cc0950e2.dir
-      size: 506201607
+      md5: 13c3100e1486c27a83a8a47491077842.dir
+      size: 773523079
      nfiles: 36
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: 4a496483bffad3efe671f29110729e48
-      size: 221
+      md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
+      size: 224
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
@@ -116,13 +115,13 @@ stages:
      size: 2464
    - path: data/model
      hash: md5
-      md5: 43b72f9284e92842cbc82bc7cc0950e2.dir
-      size: 506201607
+      md5: 13c3100e1486c27a83a8a47491077842.dir
+      size: 773523079
      nfiles: 36
    - path: data/prepared_data
      hash: md5
-      md5: f506f1f059945c0f014c3f505a63726c.dir
-      size: 30388447
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
+      size: 45056059
      nfiles: 2
    params:
      configs/settings.yaml:
@@ -134,8 +133,8 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 88832d623c3e437eaec221307ac33aae.dir
-      size: 163584
+      md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
+      size: 463197
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
@@ -146,13 +145,13 @@ stages:
      size: 3484
    - path: data/predictions
      hash: md5
-      md5: 88832d623c3e437eaec221307ac33aae.dir
-      size: 163584
+      md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
+      size: 463197
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: f506f1f059945c0f014c3f505a63726c.dir
-      size: 30388447
+      md5: 80c9e138146a1d96b9d16091c207e2e8.dir
+      size: 45056059
      nfiles: 2
    params:
      configs/settings.yaml:
@@ -162,8 +161,8 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: f2783bdec0f0974b6d799609c6189467
-      size: 222
+      md5: 3e08df02fd5c5d094bcf936e1338d596
+      size: 223
  generate_scenerio_metrics:
    cmd: python 5_generate_scenarios.py
    deps:
@@ -177,14 +176,15 @@ stages:
          input_dataclient_type: aws-s3
          output_dataclient_type: local
          scenario_data_filepaths:
+          - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
          comparison_output_filepath: ./metrics/scenario_table.md
          metrics_output_filepath: ./metrics/scenario_metrics.md
    outs:
    - path: metrics/scenario_metrics.md
      hash: md5
-      md5: d41d8cd98f00b204e9800998ecf8427e
-      size: 0
+      md5: fa4d6d7bbd7818613800da5f8f37ea96
+      size: 363
    - path: metrics/scenario_table.md
      hash: md5
-      md5: d41d8cd98f00b204e9800998ecf8427e
-      size: 0
+      md5: d6baf100a1623cc2467c2f8221d314c9
+      size: 2133