From 1320416dc355af0170306bc921064744d436f54b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 5 Jul 2024 12:15:01 +0100 Subject: [PATCH] Added new ecr instances --- etl/bill_savings/EnergyConsumptionModel.py | 163 ++++++++++++++++++--- etl/bill_savings/data_collection.py | 2 +- infrastructure/terraform/main.tf | 46 ++++-- 3 files changed, 174 insertions(+), 37 deletions(-) diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index 89847ca1..534b8d60 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -1,10 +1,12 @@ import pandas as pd +import numpy as np +import msgpack from xgboost import XGBRegressor from datetime import datetime from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error from sklearn.feature_selection import RFECV -from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet +from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet, read_from_s3 import logging from pprint import pprint @@ -14,17 +16,36 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %( class EnergyConsumptionModel: FEATURES = { + # "heating_kwh": [ + # "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", + # "heating-cost-current", + # "total-floor-area", "number-heated-rooms", + # "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", + # "property-type", "built-form", "mainheatcont-description", 'hotwater-description', 'hot-water-energy-eff', + # # TESTING + # # "walls-description", + # "walls-energy-eff", + # # "roof-description", + # "roof-energy-eff", + # # "floor-description", + # # "county" + # # "co2-emissions-current", - Made it worse + # # TODO: Should hot water features go in here? + # # , , "number-habitable-rooms", + # # + # # + # # + # ], "heating_kwh": [ "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", - "heating-cost-current", - "total-floor-area", "number-heated-rooms", - "mainheat-description", "mainheat-energy-eff", "main-fuel", - # TESTING - "secondheat-description", - # , , "number-habitable-rooms", - # "mainheatcont-description", - # "co2-emissions-current", - # "property-type", "built-form", + "heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms", + "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type", + "built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff", + "walls-energy-eff", + "roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey", + "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", + "low-energy-lighting", "environment-impact-current", "energy-tariff", + "county", "construction-age-band", "co2-emissions-current" ], "hot_water_kwh": [ "lodgement-year", "lodgement-month", @@ -41,9 +62,15 @@ class EnergyConsumptionModel: "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", "number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form", "construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff", + "walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description", + "county", + "windows-description", "windows-energy-eff", "flat-top-storey", + "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", + "low-energy-lighting", "environment-impact-current", "energy-tariff" ] - def __init__(self, model_paths=None, n_jobs=1): + def __init__(self, cleaned, model_paths=None, n_jobs=1): + self.cleaned = cleaned self.models = {} self.model_paths = model_paths or {} self.n_jobs = n_jobs @@ -85,6 +112,55 @@ class EnergyConsumptionModel: self.data["lodgement-year"] = self.data["lodgement-date"].dt.year self.data["lodgement-month"] = self.data["lodgement-date"].dt.month + # For walls, roof, floor description where we have average thermal transmittance, to avoid too many categories + # we group them + ranges = { + "lessthan 0.1": (0, 0.1), + "0.1 - 0.3": (0.1, 0.3), + "0.3 - 0.5": (0.3, 0.5), + "morethan 0.5": (0.5, 2.5), + } + + # Generate the lookup table + thermal_transmittance_lookup_table = [] + for i in range(1, 251): + value = i / 100 + for label, (low, high) in ranges.items(): + if low < value <= high: + thermal_transmittance_lookup_table.append({"from": value, "to": label}) + break + + # Convert to DataFrame for display + thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table) + thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str) + + # Apply the lookup table to the data + for feature in ["walls-description", "roof-description", "floor-description"]: + cleaned_df = pd.DataFrame(self.cleaned[feature])[["original_description", "thermal_transmittance"]] + # Round to 2 decimal places and convert to string + cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str) + + self.data = self.data.merge( + cleaned_df, + how="left", + left_on=feature, + right_on="original_description", + ) + # We now have the thermal transmittance in the data, which we can use to group with the lookup table + self.data = self.data.merge( + thermal_transmittance_lookup_table, + how="left", + left_on="thermal_transmittance", + right_on="from", + ) + # Where "to" is populated, replace feature with to + self.data[feature] = np.where( + ~pd.isnull(self.data["to"]), + self.data["to"], + self.data[feature] + ) + self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"]) + # Modify number of heated rooms and number of habitable rooms # self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else # str(x)) @@ -192,7 +268,8 @@ class EnergyConsumptionModel: max_depth=6, subsample=0.8, colsample_bytree=0.8, - # n_jobs=self.n_jobs + reg_alpha=0.1, + reg_lambda=0.1 ) return XGBRegressor( @@ -200,26 +277,62 @@ class EnergyConsumptionModel: n_estimators=1000, learning_rate=0.05, max_depth=6, + min_child_weight=3, subsample=0.8, colsample_bytree=0.8, + reg_alpha=0.1, + reg_lambda=0.1 # n_jobs=self.n_jobs ) def fit_model(self, target): - """Fits the linear regression model to the training data.""" + """Fits the model to the training data and removes zero-importance features.""" + logging.info(f"Fitting model for target {target}") - self.models[target] = self.init_model() - self.models[target].fit( + + # Initialize and fit the model + model = self.init_model() + model.fit( self.x_train[target], self.y_train[target], eval_set=[(self.x_val[target], self.y_val[target])], early_stopping_rounds=50 ) - logging.info(f"Model fitting completed for target {target}") + + # Store the model + self.models[target] = model + + # Identify and remove zero-importance features + feature_importance = pd.DataFrame({ + 'Feature': self.x_train[target].columns, + 'Importance': model.feature_importances_ + }) + zero_importance_features = feature_importance[feature_importance['Importance'] == 0]['Feature'].tolist() + + if zero_importance_features: + logging.info(f"Removing zero-importance features for target {target}: {zero_importance_features}") + + self.x_train[target] = self.x_train[target].drop(columns=zero_importance_features) + self.x_val[target] = self.x_val[target].drop(columns=zero_importance_features) + self.x_test[target] = self.x_test[target].drop(columns=zero_importance_features) + + # Re-fit the model with the reduced feature set + model = self.init_model() + model.fit( + self.x_train[target], + self.y_train[target], + eval_set=[(self.x_val[target], self.y_val[target])], + early_stopping_rounds=50 + ) + + # Update the model + self.models[target] = model # Store the best iteration self.best_iteration[target] = self.models[target].best_iteration + logging.info(f"Model fitting completed for target {target}") + def re_train_final_model(self, target): """Re-trains the final model on the combined training and validation set.""" logging.info(f"Re-training final model for target {target}") @@ -391,16 +504,21 @@ class EnergyConsumptionModel: return summary -# Example usage: -model = EnergyConsumptionModel(n_jobs=2) -model.read_dataset('energy_consumption/2024-07-04/energy_consumption_dataset.parquet') +# Usage: +cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" +) + +cleaned = msgpack.unpackb(cleaned, raw=False) + +model = EnergyConsumptionModel(cleaned=cleaned, n_jobs=2) +model.read_dataset('energy_consumption/2024-07-05/energy_consumption_dataset.parquet') model.feature_engineering() # For heating_kwh model.split_dataset(target='heating_kwh') -# model.feature_selection(target='heating_kwh', cv_folds=3, sample_fraction=0.1) model.fit_model(target='heating_kwh') - model.re_train_final_model(target='heating_kwh') evaluation_results = model.evaluate_model(target='heating_kwh') @@ -410,8 +528,11 @@ pprint(evaluation_results["test"]) importance_df = evaluation_results["train"]["Feature Importance"] testing_predictions = model.testing_predictions["heating_kwh"] testing_predictions = testing_predictions.sort_values("residual", ascending=False) +training_predictions = model.training_predictions["heating_kwh"] +training_predictions = training_predictions.sort_values("residual", ascending=False) # Merge on model.input_data, by the index merged_data = testing_predictions.merge(model.input_data, left_index=True, right_index=True) +merged_data_train = training_predictions.merge(model.input_data, left_index=True, right_index=True) # For hot_water_kwh model.split_dataset(target='hot_water_kwh') diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index ecc62015..4d913e8f 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -133,7 +133,7 @@ def app(): energy_consumption_data = [] for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)): # Skip the first 50 - if i < 305: + if i < 36: continue data = pd.read_csv(directory / "certificates.csv", low_memory=False) diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index 0da850c5..f968aba8 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -49,30 +49,30 @@ resource "aws_security_group" "allow_db" { ingress { # TLS (change to whatever ports you need) - from_port = 5432 - to_port = 5432 - protocol = "tcp" + from_port = 5432 + to_port = 5432 + protocol = "tcp" cidr_blocks = ["0.0.0.0/0"] } egress { - from_port = 0 - to_port = 0 - protocol = "-1" + from_port = 0 + to_port = 0 + protocol = "-1" cidr_blocks = ["0.0.0.0/0"] } } resource "aws_db_instance" "default" { - allocated_storage = var.allocated_storage - engine = "postgres" - engine_version = "14.10" - instance_class = var.instance_class - db_name = var.database_name - username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"] - password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"] - parameter_group_name = "default.postgres14" - skip_final_snapshot = true + allocated_storage = var.allocated_storage + engine = "postgres" + engine_version = "14.10" + instance_class = var.instance_class + db_name = var.database_name + username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"] + password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"] + parameter_group_name = "default.postgres14" + skip_final_snapshot = true vpc_security_group_ids = [aws_security_group.allow_db.id] lifecycle { prevent_destroy = true @@ -187,6 +187,22 @@ module "lambda_heat_prediction_ecr" { source = "./modules/ecr" } +# ECR repos for lighting cost, heating cost and hot water cost models +module "lambda_lighting_cost_prediction_ecr" { + ecr_name = "lighting-cost-prediction-${var.stage}" + source = "./modules/ecr" +} + +module "lambda_heating_cost_prediction_ecr" { + ecr_name = "heating-cost-prediction-${var.stage}" + source = "./modules/ecr" +} + +module "lambda_hot_water_cost_prediction_ecr" { + ecr_name = "hot-water-cost-prediction-${var.stage}" + source = "./modules/ecr" +} + ############################################## # CDN - Cloudfront ##############################################