From 1320416dc355af0170306bc921064744d436f54b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 5 Jul 2024 12:15:01 +0100
Subject: [PATCH] Added new ecr instances

---
 etl/bill_savings/EnergyConsumptionModel.py | 163 ++++++++++++++++++---
 etl/bill_savings/data_collection.py        |   2 +-
 infrastructure/terraform/main.tf           |  46 ++++--
 3 files changed, 174 insertions(+), 37 deletions(-)

diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py
index 89847ca1..534b8d60 100644
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@@ -1,10 +1,12 @@
 import pandas as pd
+import numpy as np
+import msgpack
 from xgboost import XGBRegressor
 from datetime import datetime
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
 from sklearn.feature_selection import RFECV
-from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
+from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet, read_from_s3
 import logging
 from pprint import pprint
 
@@ -14,17 +16,36 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 
 class EnergyConsumptionModel:
     FEATURES = {
+        # "heating_kwh": [
+        #     "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
+        #     "heating-cost-current",
+        #     "total-floor-area", "number-heated-rooms",
+        #     "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description",
+        #     "property-type", "built-form", "mainheatcont-description", 'hotwater-description', 'hot-water-energy-eff',
+        #     # TESTING
+        #     # "walls-description",
+        #     "walls-energy-eff",
+        #     # "roof-description",
+        #     "roof-energy-eff",
+        #     # "floor-description",
+        #     # "county"
+        #     # "co2-emissions-current", - Made it worse
+        #     # TODO: Should hot water features go in here?
+        #     # , , "number-habitable-rooms",
+        #     #
+        #     #
+        #     #
+        # ],
         "heating_kwh": [
             "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-            "heating-cost-current",
-            "total-floor-area", "number-heated-rooms",
-            "mainheat-description", "mainheat-energy-eff", "main-fuel",
-            # TESTING
-            "secondheat-description",
-            # , , "number-habitable-rooms",
-            # "mainheatcont-description",
-            # "co2-emissions-current",
-            # "property-type", "built-form",
+            "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms",
+            "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type",
+            "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff",
+            "walls-energy-eff",
+            "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey",
+            "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
+            "low-energy-lighting", "environment-impact-current", "energy-tariff",
+            "county", "construction-age-band", "co2-emissions-current"
         ],
         "hot_water_kwh": [
             "lodgement-year", "lodgement-month",
@@ -41,9 +62,15 @@ class EnergyConsumptionModel:
         "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
         "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form",
         "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
+        "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
+        "county",
+        "windows-description", "windows-energy-eff", "flat-top-storey",
+        "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
+        "low-energy-lighting", "environment-impact-current", "energy-tariff"
     ]
 
-    def __init__(self, model_paths=None, n_jobs=1):
+    def __init__(self, cleaned, model_paths=None, n_jobs=1):
+        self.cleaned = cleaned
         self.models = {}
         self.model_paths = model_paths or {}
         self.n_jobs = n_jobs
@@ -85,6 +112,55 @@ class EnergyConsumptionModel:
         self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
         self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
 
+        # Wall, roof and floor descriptions that carry an average thermal transmittance are grouped into a few
+        # transmittance bands, so that each distinct value does not become its own category
+        ranges = {
+            "lessthan 0.1": (0, 0.1),
+            "0.1 - 0.3": (0.1, 0.3),
+            "0.3 - 0.5": (0.3, 0.5),
+            "morethan 0.5": (0.5, 2.5),
+        }
+
+        # Generate the lookup table: each value from 0.01 to 2.50 (step 0.01) mapped to its band label
+        thermal_transmittance_lookup_table = []
+        for i in range(1, 251):
+            value = i / 100
+            for label, (low, high) in ranges.items():
+                if low < value <= high:
+                    thermal_transmittance_lookup_table.append({"from": value, "to": label})
+                    break
+
+        # Convert to a DataFrame and cast the keys to str so they can be merged with the stringified values below
+        thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
+        thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
+
+        # Apply the lookup table to the data
+        for feature in ["walls-description", "roof-description", "floor-description"]:
+            cleaned_df = pd.DataFrame(self.cleaned[feature])[["original_description", "thermal_transmittance"]]
+            # Round to 2 decimal places and convert to string
+            cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
+
+            self.data = self.data.merge(
+                cleaned_df,
+                how="left",
+                left_on=feature,
+                right_on="original_description",
+            )
+            # We now have the thermal transmittance in the data, which we can use to group with the lookup table
+            self.data = self.data.merge(
+                thermal_transmittance_lookup_table,
+                how="left",
+                left_on="thermal_transmittance",
+                right_on="from",
+            )
+            # Where a band label ("to") was found it replaces the description; otherwise the description is kept
+            self.data[feature] = np.where(
+                ~pd.isnull(self.data["to"]),
+                self.data["to"],
+                self.data[feature]
+            )
+            self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
+
         # Modify number of heated rooms and number of habitable rooms
         # self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else
         # str(x))
@@ -192,7 +268,8 @@ class EnergyConsumptionModel:
                 max_depth=6,
                 subsample=0.8,
                 colsample_bytree=0.8,
-                # n_jobs=self.n_jobs
+                reg_alpha=0.1,
+                reg_lambda=0.1
             )
 
         return XGBRegressor(
@@ -200,26 +277,62 @@ class EnergyConsumptionModel:
             n_estimators=1000,
             learning_rate=0.05,
             max_depth=6,
+            min_child_weight=3,
             subsample=0.8,
             colsample_bytree=0.8,
+            reg_alpha=0.1,
+            reg_lambda=0.1
             # n_jobs=self.n_jobs
         )
 
     def fit_model(self, target):
-        """Fits the linear regression model to the training data."""
+        """Fits the model to the training data and removes zero-importance features."""
+
         logging.info(f"Fitting model for target {target}")
-        self.models[target] = self.init_model()
-        self.models[target].fit(
+
+        # Initialize and fit the model
+        model = self.init_model()
+        model.fit(
             self.x_train[target],
             self.y_train[target],
             eval_set=[(self.x_val[target], self.y_val[target])],
             early_stopping_rounds=50
         )
-        logging.info(f"Model fitting completed for target {target}")
+
+        # Store the model
+        self.models[target] = model
+
+        # Identify and remove zero-importance features
+        feature_importance = pd.DataFrame({
+            'Feature': self.x_train[target].columns,
+            'Importance': model.feature_importances_
+        })
+        zero_importance_features = feature_importance[feature_importance['Importance'] == 0]['Feature'].tolist()
+
+        if zero_importance_features:
+            logging.info(f"Removing zero-importance features for target {target}: {zero_importance_features}")
+
+            self.x_train[target] = self.x_train[target].drop(columns=zero_importance_features)
+            self.x_val[target] = self.x_val[target].drop(columns=zero_importance_features)
+            self.x_test[target] = self.x_test[target].drop(columns=zero_importance_features)
+
+            # Re-fit the model with the reduced feature set
+            model = self.init_model()
+            model.fit(
+                self.x_train[target],
+                self.y_train[target],
+                eval_set=[(self.x_val[target], self.y_val[target])],
+                early_stopping_rounds=50
+            )
+
+            # Update the model
+            self.models[target] = model
 
         # Store the best iteration
         self.best_iteration[target] = self.models[target].best_iteration
 
+        logging.info(f"Model fitting completed for target {target}")
+
     def re_train_final_model(self, target):
         """Re-trains the final model on the combined training and validation set."""
         logging.info(f"Re-training final model for target {target}")
@@ -391,16 +504,21 @@ class EnergyConsumptionModel:
         return summary
 
 
-# Example usage:
-model = EnergyConsumptionModel(n_jobs=2)
-model.read_dataset('energy_consumption/2024-07-04/energy_consumption_dataset.parquet')
+# Usage:
+cleaned = read_from_s3(
+    s3_file_name="cleaned_epc_data/cleaned.bson",
+    bucket_name="retrofit-data-dev"
+)
+
+cleaned = msgpack.unpackb(cleaned, raw=False)
+
+model = EnergyConsumptionModel(cleaned=cleaned, n_jobs=2)
+model.read_dataset('energy_consumption/2024-07-05/energy_consumption_dataset.parquet')
 model.feature_engineering()
 
 # For heating_kwh
 model.split_dataset(target='heating_kwh')
-# model.feature_selection(target='heating_kwh', cv_folds=3, sample_fraction=0.1)
 model.fit_model(target='heating_kwh')
-
 model.re_train_final_model(target='heating_kwh')
 evaluation_results = model.evaluate_model(target='heating_kwh')
 
@@ -410,8 +528,11 @@ pprint(evaluation_results["test"])
 importance_df = evaluation_results["train"]["Feature Importance"]
 testing_predictions = model.testing_predictions["heating_kwh"]
 testing_predictions = testing_predictions.sort_values("residual", ascending=False)
+training_predictions = model.training_predictions["heating_kwh"]
+training_predictions = training_predictions.sort_values("residual", ascending=False)
 # Merge on model.input_data, by the index
 merged_data = testing_predictions.merge(model.input_data, left_index=True, right_index=True)
+merged_data_train = training_predictions.merge(model.input_data, left_index=True, right_index=True)
 
 # For hot_water_kwh
 model.split_dataset(target='hot_water_kwh')
diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py
index ecc62015..4d913e8f 100644
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -133,7 +133,7 @@ def app():
     energy_consumption_data = []
     for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
         # Skip the first 50
-        if i < 305:
+        if i < 36:
             continue
 
         data = pd.read_csv(directory / "certificates.csv", low_memory=False)
diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf
index 0da850c5..f968aba8 100644
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@@ -49,30 +49,30 @@ resource "aws_security_group" "allow_db" {
 
   ingress {
     # TLS (change to whatever ports you need)
-    from_port   = 5432
-    to_port     = 5432
-    protocol    = "tcp"
+    from_port = 5432
+    to_port   = 5432
+    protocol  = "tcp"
     cidr_blocks = ["0.0.0.0/0"]
   }
 
   egress {
-    from_port   = 0
-    to_port     = 0
-    protocol    = "-1"
+    from_port = 0
+    to_port   = 0
+    protocol  = "-1"
     cidr_blocks = ["0.0.0.0/0"]
   }
 }
 
 resource "aws_db_instance" "default" {
-  allocated_storage      = var.allocated_storage
-  engine                 = "postgres"
-  engine_version         = "14.10"
-  instance_class         = var.instance_class
-  db_name                = var.database_name
-  username               = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
-  password               = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
-  parameter_group_name   = "default.postgres14"
-  skip_final_snapshot    = true
+  allocated_storage    = var.allocated_storage
+  engine               = "postgres"
+  engine_version       = "14.10"
+  instance_class       = var.instance_class
+  db_name              = var.database_name
+  username             = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
+  password             = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
+  parameter_group_name = "default.postgres14"
+  skip_final_snapshot  = true
   vpc_security_group_ids = [aws_security_group.allow_db.id]
   lifecycle {
     prevent_destroy = true
@@ -187,6 +187,22 @@ module "lambda_heat_prediction_ecr" {
   source   = "./modules/ecr"
 }
 
+# ECR repos for lighting cost, heating cost and hot water cost models
+module "lambda_lighting_cost_prediction_ecr" {
+  ecr_name = "lighting-cost-prediction-${var.stage}"
+  source   = "./modules/ecr"
+}
+
+module "lambda_heating_cost_prediction_ecr" {
+  ecr_name = "heating-cost-prediction-${var.stage}"
+  source   = "./modules/ecr"
+}
+
+module "lambda_hot_water_cost_prediction_ecr" {
+  ecr_name = "hot-water-cost-prediction-${var.stage}"
+  source   = "./modules/ecr"
+}
+
 ##############################################
 # CDN - Cloudfront
 ##############################################