Added new ecr instances

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-05 12:15:01 +01:00
parent fa6e61f0b9
commit 1320416dc3
3 changed files with 174 additions and 37 deletions

View file

@ -1,10 +1,12 @@
import pandas as pd
import numpy as np
import msgpack
from xgboost import XGBRegressor
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import RFECV
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet, read_from_s3
import logging
from pprint import pprint
@ -14,17 +16,36 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
class EnergyConsumptionModel:
FEATURES = {
# "heating_kwh": [
# "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
# "heating-cost-current",
# "total-floor-area", "number-heated-rooms",
# "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description",
# "property-type", "built-form", "mainheatcont-description", 'hotwater-description', 'hot-water-energy-eff',
# # TESTING
# # "walls-description",
# "walls-energy-eff",
# # "roof-description",
# "roof-energy-eff",
# # "floor-description",
# # "county"
# # "co2-emissions-current", - Made it worse
# # TODO: Should hot water features go in here?
# # , , "number-habitable-rooms",
# #
# #
# #
# ],
"heating_kwh": [
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
"heating-cost-current",
"total-floor-area", "number-heated-rooms",
"mainheat-description", "mainheat-energy-eff", "main-fuel",
# TESTING
"secondheat-description",
# , , "number-habitable-rooms",
# "mainheatcont-description",
# "co2-emissions-current",
# "property-type", "built-form",
"heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms",
"mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type",
"built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff",
"walls-energy-eff",
"roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey",
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff",
"county", "construction-age-band", "co2-emissions-current"
],
"hot_water_kwh": [
"lodgement-year", "lodgement-month",
@ -41,9 +62,15 @@ class EnergyConsumptionModel:
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form",
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
"county",
"windows-description", "windows-energy-eff", "flat-top-storey",
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff"
]
def __init__(self, model_paths=None, n_jobs=1):
def __init__(self, cleaned, model_paths=None, n_jobs=1):
self.cleaned = cleaned
self.models = {}
self.model_paths = model_paths or {}
self.n_jobs = n_jobs
@ -85,6 +112,55 @@ class EnergyConsumptionModel:
self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many categories
# we group them
ranges = {
"lessthan 0.1": (0, 0.1),
"0.1 - 0.3": (0.1, 0.3),
"0.3 - 0.5": (0.3, 0.5),
"morethan 0.5": (0.5, 2.5),
}
# Generate the lookup table
thermal_transmittance_lookup_table = []
for i in range(1, 251):
value = i / 100
for label, (low, high) in ranges.items():
if low < value <= high:
thermal_transmittance_lookup_table.append({"from": value, "to": label})
break
# Convert to a DataFrame so it can be merged onto the data below
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
# Apply the lookup table to the data
for feature in ["walls-description", "roof-description", "floor-description"]:
cleaned_df = pd.DataFrame(self.cleaned[feature])[["original_description", "thermal_transmittance"]]
# Round to 2 decimal places and convert to string
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
self.data = self.data.merge(
cleaned_df,
how="left",
left_on=feature,
right_on="original_description",
)
# We now have the thermal transmittance in the data, which we can use to group with the lookup table
self.data = self.data.merge(
thermal_transmittance_lookup_table,
how="left",
left_on="thermal_transmittance",
right_on="from",
)
# Where "to" is populated, replace feature with to
self.data[feature] = np.where(
~pd.isnull(self.data["to"]),
self.data["to"],
self.data[feature]
)
self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
# Modify number of heated rooms and number of habitable rooms
# self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else
# str(x))
@ -192,7 +268,8 @@ class EnergyConsumptionModel:
max_depth=6,
subsample=0.8,
colsample_bytree=0.8,
# n_jobs=self.n_jobs
reg_alpha=0.1,
reg_lambda=0.1
)
return XGBRegressor(
@ -200,26 +277,62 @@ class EnergyConsumptionModel:
n_estimators=1000,
learning_rate=0.05,
max_depth=6,
min_child_weight=3,
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=0.1,
reg_lambda=0.1
# n_jobs=self.n_jobs
)
def fit_model(self, target):
    """Fit the model for ``target`` and prune features the model never used.

    A first model is fitted on the training split, using the validation
    split as the evaluation set. Every feature whose importance is exactly
    zero is then dropped from the train, validation and test frames, and a
    fresh model is fitted on the reduced feature set. If *all* features
    have zero importance, nothing is dropped and the first model is kept,
    because refitting on an empty frame cannot succeed.

    Args:
        target: Key used to look up the splits in ``self.x_train``,
            ``self.y_train``, ``self.x_val``, ``self.y_val`` and
            ``self.x_test``.

    Side effects:
        - ``self.x_train[target]``, ``self.x_val[target]`` and
          ``self.x_test[target]`` may lose columns.
        - ``self.models[target]`` is set to the final fitted model.
        - ``self.best_iteration[target]`` is set from that model.
    """
    logging.info(f"Fitting model for target {target}")

    def fit_fresh_model():
        # A new estimator is created for every fit so that the refit does
        # not start from the state of the first one.
        # NOTE(review): newer XGBoost releases expect early_stopping_rounds
        # on the estimator constructor rather than on fit() - confirm
        # against the pinned xgboost version.
        fresh = self.init_model()
        fresh.fit(
            self.x_train[target],
            self.y_train[target],
            eval_set=[(self.x_val[target], self.y_val[target])],
            early_stopping_rounds=50,
        )
        return fresh

    model = fit_fresh_model()

    # Identify features the fitted model assigned zero importance to
    importances = pd.Series(
        model.feature_importances_,
        index=self.x_train[target].columns,
    )
    zero_importance_features = importances[importances == 0].index.tolist()

    if zero_importance_features and len(zero_importance_features) < len(importances):
        logging.info(f"Removing zero-importance features for target {target}: {zero_importance_features}")
        # Keep all three splits aligned on the same reduced column set
        for split in (self.x_train, self.x_val, self.x_test):
            split[target] = split[target].drop(columns=zero_importance_features)

        # Re-fit the model with the reduced feature set
        model = fit_fresh_model()
    elif zero_importance_features:
        # Dropping every column would leave nothing to train on
        logging.warning(
            f"All features have zero importance for target {target}; keeping the full feature set"
        )

    # Store the final model and its best iteration
    self.models[target] = model
    self.best_iteration[target] = model.best_iteration

    # Logged once, after any refit, so the message is not emitted prematurely
    logging.info(f"Model fitting completed for target {target}")
def re_train_final_model(self, target):
"""Re-trains the final model on the combined training and validation set."""
logging.info(f"Re-training final model for target {target}")
@ -391,16 +504,21 @@ class EnergyConsumptionModel:
return summary
# Example usage:
model = EnergyConsumptionModel(n_jobs=2)
model.read_dataset('energy_consumption/2024-07-04/energy_consumption_dataset.parquet')
# Usage:
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
cleaned = msgpack.unpackb(cleaned, raw=False)
model = EnergyConsumptionModel(cleaned=cleaned, n_jobs=2)
model.read_dataset('energy_consumption/2024-07-05/energy_consumption_dataset.parquet')
model.feature_engineering()
# For heating_kwh
model.split_dataset(target='heating_kwh')
# model.feature_selection(target='heating_kwh', cv_folds=3, sample_fraction=0.1)
model.fit_model(target='heating_kwh')
model.re_train_final_model(target='heating_kwh')
evaluation_results = model.evaluate_model(target='heating_kwh')
@ -410,8 +528,11 @@ pprint(evaluation_results["test"])
importance_df = evaluation_results["train"]["Feature Importance"]
testing_predictions = model.testing_predictions["heating_kwh"]
testing_predictions = testing_predictions.sort_values("residual", ascending=False)
training_predictions = model.training_predictions["heating_kwh"]
training_predictions = training_predictions.sort_values("residual", ascending=False)
# Merge on model.input_data, by the index
merged_data = testing_predictions.merge(model.input_data, left_index=True, right_index=True)
merged_data_train = training_predictions.merge(model.input_data, left_index=True, right_index=True)
# For hot_water_kwh
model.split_dataset(target='hot_water_kwh')

View file

@ -133,7 +133,7 @@ def app():
energy_consumption_data = []
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
# Skip the leading directories (those whose index is below the hard-coded threshold)
if i < 305:
if i < 36:
continue
data = pd.read_csv(directory / "certificates.csv", low_memory=False)

View file

@ -49,30 +49,30 @@ resource "aws_security_group" "allow_db" {
ingress {
# PostgreSQL port. NOTE(review): ingress is open to 0.0.0.0/0 - confirm this exposure is intended
from_port = 5432
to_port = 5432
protocol = "tcp"
from_port = 5432
to_port = 5432
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
resource "aws_db_instance" "default" {
allocated_storage = var.allocated_storage
engine = "postgres"
engine_version = "14.10"
instance_class = var.instance_class
db_name = var.database_name
username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
parameter_group_name = "default.postgres14"
skip_final_snapshot = true
allocated_storage = var.allocated_storage
engine = "postgres"
engine_version = "14.10"
instance_class = var.instance_class
db_name = var.database_name
username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
parameter_group_name = "default.postgres14"
skip_final_snapshot = true
vpc_security_group_ids = [aws_security_group.allow_db.id]
lifecycle {
prevent_destroy = true
@ -187,6 +187,22 @@ module "lambda_heat_prediction_ecr" {
source = "./modules/ecr"
}
# ECR repos for lighting cost, heating cost and hot water cost models
# Instantiates ./modules/ecr for the lighting cost prediction model,
# passing a stage-suffixed ecr_name.
module "lambda_lighting_cost_prediction_ecr" {
  source = "./modules/ecr"

  ecr_name = "lighting-cost-prediction-${var.stage}"
}
# Instantiates ./modules/ecr for the heating cost prediction model,
# passing a stage-suffixed ecr_name.
module "lambda_heating_cost_prediction_ecr" {
  source = "./modules/ecr"

  ecr_name = "heating-cost-prediction-${var.stage}"
}
# Instantiates ./modules/ecr for the hot water cost prediction model,
# passing a stage-suffixed ecr_name.
module "lambda_hot_water_cost_prediction_ecr" {
  source = "./modules/ecr"

  ecr_name = "hot-water-cost-prediction-${var.stage}"
}
##############################################
# CDN - Cloudfront
##############################################