mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added new ecr instances
This commit is contained in:
parent
fa6e61f0b9
commit
1320416dc3
3 changed files with 174 additions and 37 deletions
|
|
@ -1,10 +1,12 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import msgpack
|
||||
from xgboost import XGBRegressor
|
||||
from datetime import datetime
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
|
||||
from sklearn.feature_selection import RFECV
|
||||
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
|
||||
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet, read_from_s3
|
||||
import logging
|
||||
from pprint import pprint
|
||||
|
||||
|
|
@ -14,17 +16,36 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
|
|||
|
||||
class EnergyConsumptionModel:
|
||||
FEATURES = {
|
||||
# "heating_kwh": [
|
||||
# "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
|
||||
# "heating-cost-current",
|
||||
# "total-floor-area", "number-heated-rooms",
|
||||
# "mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description",
|
||||
# "property-type", "built-form", "mainheatcont-description", 'hotwater-description', 'hot-water-energy-eff',
|
||||
# # TESTING
|
||||
# # "walls-description",
|
||||
# "walls-energy-eff",
|
||||
# # "roof-description",
|
||||
# "roof-energy-eff",
|
||||
# # "floor-description",
|
||||
# # "county"
|
||||
# # "co2-emissions-current", - Made it worse
|
||||
# # TODO: Should hot water features go in here?
|
||||
# # , , "number-habitable-rooms",
|
||||
# #
|
||||
# #
|
||||
# #
|
||||
# ],
|
||||
"heating_kwh": [
|
||||
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
|
||||
"heating-cost-current",
|
||||
"total-floor-area", "number-heated-rooms",
|
||||
"mainheat-description", "mainheat-energy-eff", "main-fuel",
|
||||
# TESTING
|
||||
"secondheat-description",
|
||||
# , , "number-habitable-rooms",
|
||||
# "mainheatcont-description",
|
||||
# "co2-emissions-current",
|
||||
# "property-type", "built-form",
|
||||
"heating-cost-current", "heating-cost-potential", "total-floor-area", "number-heated-rooms",
|
||||
"mainheat-description", "mainheat-energy-eff", "main-fuel", "secondheat-description", "property-type",
|
||||
"built-form", "mainheatcont-description", "hotwater-description", "hot-water-energy-eff",
|
||||
"walls-energy-eff",
|
||||
"roof-energy-eff", "windows-description", "windows-energy-eff", "floor-description", "flat-top-storey",
|
||||
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
|
||||
"low-energy-lighting", "environment-impact-current", "energy-tariff",
|
||||
"county", "construction-age-band", "co2-emissions-current"
|
||||
],
|
||||
"hot_water_kwh": [
|
||||
"lodgement-year", "lodgement-month",
|
||||
|
|
@ -41,9 +62,15 @@ class EnergyConsumptionModel:
|
|||
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
|
||||
"number-habitable-rooms", "mainheat-energy-eff", "mainheatcont-description", "property-type", "built-form",
|
||||
"construction-age-band", "secondheat-description", "hotwater-description", "hot-water-energy-eff",
|
||||
"walls-description", "walls-energy-eff", "roof-description", "roof-energy-eff", "floor-description",
|
||||
"county",
|
||||
"windows-description", "windows-energy-eff", "flat-top-storey",
|
||||
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
|
||||
"low-energy-lighting", "environment-impact-current", "energy-tariff"
|
||||
]
|
||||
|
||||
def __init__(self, model_paths=None, n_jobs=1):
|
||||
def __init__(self, cleaned, model_paths=None, n_jobs=1):
|
||||
self.cleaned = cleaned
|
||||
self.models = {}
|
||||
self.model_paths = model_paths or {}
|
||||
self.n_jobs = n_jobs
|
||||
|
|
@ -85,6 +112,55 @@ class EnergyConsumptionModel:
|
|||
self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
|
||||
self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
|
||||
|
||||
# For walls, roof, floor description where we have average thermal transmittance, to avoid too many categories
|
||||
# we group them
|
||||
ranges = {
|
||||
"lessthan 0.1": (0, 0.1),
|
||||
"0.1 - 0.3": (0.1, 0.3),
|
||||
"0.3 - 0.5": (0.3, 0.5),
|
||||
"morethan 0.5": (0.5, 2.5),
|
||||
}
|
||||
|
||||
# Generate the lookup table
|
||||
thermal_transmittance_lookup_table = []
|
||||
for i in range(1, 251):
|
||||
value = i / 100
|
||||
for label, (low, high) in ranges.items():
|
||||
if low < value <= high:
|
||||
thermal_transmittance_lookup_table.append({"from": value, "to": label})
|
||||
break
|
||||
|
||||
# Convert to a DataFrame so the lookup table can be merged onto the data below
|
||||
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
|
||||
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
|
||||
|
||||
# Apply the lookup table to the data
|
||||
for feature in ["walls-description", "roof-description", "floor-description"]:
|
||||
cleaned_df = pd.DataFrame(self.cleaned[feature])[["original_description", "thermal_transmittance"]]
|
||||
# Round to 2 decimal places and convert to string
|
||||
cleaned_df["thermal_transmittance"] = cleaned_df["thermal_transmittance"].round(2).astype(str)
|
||||
|
||||
self.data = self.data.merge(
|
||||
cleaned_df,
|
||||
how="left",
|
||||
left_on=feature,
|
||||
right_on="original_description",
|
||||
)
|
||||
# We now have the thermal transmittance in the data, which we can use to group with the lookup table
|
||||
self.data = self.data.merge(
|
||||
thermal_transmittance_lookup_table,
|
||||
how="left",
|
||||
left_on="thermal_transmittance",
|
||||
right_on="from",
|
||||
)
|
||||
# Where "to" is populated, replace the feature value with the "to" group label
|
||||
self.data[feature] = np.where(
|
||||
~pd.isnull(self.data["to"]),
|
||||
self.data["to"],
|
||||
self.data[feature]
|
||||
)
|
||||
self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
|
||||
|
||||
# Modify number of heated rooms and number of habitable rooms
|
||||
# self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else
|
||||
# str(x))
|
||||
|
|
@ -192,7 +268,8 @@ class EnergyConsumptionModel:
|
|||
max_depth=6,
|
||||
subsample=0.8,
|
||||
colsample_bytree=0.8,
|
||||
# n_jobs=self.n_jobs
|
||||
reg_alpha=0.1,
|
||||
reg_lambda=0.1
|
||||
)
|
||||
|
||||
return XGBRegressor(
|
||||
|
|
@ -200,26 +277,62 @@ class EnergyConsumptionModel:
|
|||
n_estimators=1000,
|
||||
learning_rate=0.05,
|
||||
max_depth=6,
|
||||
min_child_weight=3,
|
||||
subsample=0.8,
|
||||
colsample_bytree=0.8,
|
||||
reg_alpha=0.1,
|
||||
reg_lambda=0.1
|
||||
# n_jobs=self.n_jobs
|
||||
)
|
||||
|
||||
def fit_model(self, target):
|
||||
"""Fits the linear regression model to the training data."""
|
||||
"""Fits the model to the training data and removes zero-importance features."""
|
||||
|
||||
logging.info(f"Fitting model for target {target}")
|
||||
self.models[target] = self.init_model()
|
||||
self.models[target].fit(
|
||||
|
||||
# Initialize and fit the model
|
||||
model = self.init_model()
|
||||
model.fit(
|
||||
self.x_train[target],
|
||||
self.y_train[target],
|
||||
eval_set=[(self.x_val[target], self.y_val[target])],
|
||||
early_stopping_rounds=50
|
||||
)
|
||||
logging.info(f"Model fitting completed for target {target}")
|
||||
|
||||
# Store the model
|
||||
self.models[target] = model
|
||||
|
||||
# Identify and remove zero-importance features
|
||||
feature_importance = pd.DataFrame({
|
||||
'Feature': self.x_train[target].columns,
|
||||
'Importance': model.feature_importances_
|
||||
})
|
||||
zero_importance_features = feature_importance[feature_importance['Importance'] == 0]['Feature'].tolist()
|
||||
|
||||
if zero_importance_features:
|
||||
logging.info(f"Removing zero-importance features for target {target}: {zero_importance_features}")
|
||||
|
||||
self.x_train[target] = self.x_train[target].drop(columns=zero_importance_features)
|
||||
self.x_val[target] = self.x_val[target].drop(columns=zero_importance_features)
|
||||
self.x_test[target] = self.x_test[target].drop(columns=zero_importance_features)
|
||||
|
||||
# Re-fit the model with the reduced feature set
|
||||
model = self.init_model()
|
||||
model.fit(
|
||||
self.x_train[target],
|
||||
self.y_train[target],
|
||||
eval_set=[(self.x_val[target], self.y_val[target])],
|
||||
early_stopping_rounds=50
|
||||
)
|
||||
|
||||
# Update the model
|
||||
self.models[target] = model
|
||||
|
||||
# Store the best iteration
|
||||
self.best_iteration[target] = self.models[target].best_iteration
|
||||
|
||||
logging.info(f"Model fitting completed for target {target}")
|
||||
|
||||
def re_train_final_model(self, target):
|
||||
"""Re-trains the final model on the combined training and validation set."""
|
||||
logging.info(f"Re-training final model for target {target}")
|
||||
|
|
@ -391,16 +504,21 @@ class EnergyConsumptionModel:
|
|||
return summary
|
||||
|
||||
|
||||
# Example usage:
|
||||
model = EnergyConsumptionModel(n_jobs=2)
|
||||
model.read_dataset('energy_consumption/2024-07-04/energy_consumption_dataset.parquet')
|
||||
# Usage:
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
model = EnergyConsumptionModel(cleaned=cleaned, n_jobs=2)
|
||||
model.read_dataset('energy_consumption/2024-07-05/energy_consumption_dataset.parquet')
|
||||
model.feature_engineering()
|
||||
|
||||
# For heating_kwh
|
||||
model.split_dataset(target='heating_kwh')
|
||||
# model.feature_selection(target='heating_kwh', cv_folds=3, sample_fraction=0.1)
|
||||
model.fit_model(target='heating_kwh')
|
||||
|
||||
model.re_train_final_model(target='heating_kwh')
|
||||
evaluation_results = model.evaluate_model(target='heating_kwh')
|
||||
|
||||
|
|
@ -410,8 +528,11 @@ pprint(evaluation_results["test"])
|
|||
importance_df = evaluation_results["train"]["Feature Importance"]
|
||||
testing_predictions = model.testing_predictions["heating_kwh"]
|
||||
testing_predictions = testing_predictions.sort_values("residual", ascending=False)
|
||||
training_predictions = model.training_predictions["heating_kwh"]
|
||||
training_predictions = training_predictions.sort_values("residual", ascending=False)
|
||||
# Merge on model.input_data, by the index
|
||||
merged_data = testing_predictions.merge(model.input_data, left_index=True, right_index=True)
|
||||
merged_data_train = training_predictions.merge(model.input_data, left_index=True, right_index=True)
|
||||
|
||||
# For hot_water_kwh
|
||||
model.split_dataset(target='hot_water_kwh')
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ def app():
|
|||
energy_consumption_data = []
|
||||
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
|
||||
# Skip the leading directories (those whose index is below the threshold checked below)
|
||||
if i < 305:
|
||||
if i < 36:
|
||||
continue
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
|
|
|
|||
|
|
@ -49,30 +49,30 @@ resource "aws_security_group" "allow_db" {
|
|||
|
||||
ingress {
|
||||
# PostgreSQL port 5432 (change to whatever ports you need)
|
||||
from_port = 5432
|
||||
to_port = 5432
|
||||
protocol = "tcp"
|
||||
from_port = 5432
|
||||
to_port = 5432
|
||||
protocol = "tcp"
|
||||
cidr_blocks = ["0.0.0.0/0"]
|
||||
}
|
||||
|
||||
egress {
|
||||
from_port = 0
|
||||
to_port = 0
|
||||
protocol = "-1"
|
||||
from_port = 0
|
||||
to_port = 0
|
||||
protocol = "-1"
|
||||
cidr_blocks = ["0.0.0.0/0"]
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_db_instance" "default" {
|
||||
allocated_storage = var.allocated_storage
|
||||
engine = "postgres"
|
||||
engine_version = "14.10"
|
||||
instance_class = var.instance_class
|
||||
db_name = var.database_name
|
||||
username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
|
||||
password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
|
||||
parameter_group_name = "default.postgres14"
|
||||
skip_final_snapshot = true
|
||||
allocated_storage = var.allocated_storage
|
||||
engine = "postgres"
|
||||
engine_version = "14.10"
|
||||
instance_class = var.instance_class
|
||||
db_name = var.database_name
|
||||
username = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_username"]
|
||||
password = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string)["db_assessment_model_password"]
|
||||
parameter_group_name = "default.postgres14"
|
||||
skip_final_snapshot = true
|
||||
vpc_security_group_ids = [aws_security_group.allow_db.id]
|
||||
lifecycle {
|
||||
prevent_destroy = true
|
||||
|
|
@ -187,6 +187,22 @@ module "lambda_heat_prediction_ecr" {
|
|||
source = "./modules/ecr"
|
||||
}
|
||||
|
||||
# ECR repos for lighting cost, heating cost and hot water cost models
|
||||
module "lambda_lighting_cost_prediction_ecr" {
|
||||
ecr_name = "lighting-cost-prediction-${var.stage}"
|
||||
source = "./modules/ecr"
|
||||
}
|
||||
|
||||
module "lambda_heating_cost_prediction_ecr" {
|
||||
ecr_name = "heating-cost-prediction-${var.stage}"
|
||||
source = "./modules/ecr"
|
||||
}
|
||||
|
||||
module "lambda_hot_water_cost_prediction_ecr" {
|
||||
ecr_name = "hot-water-cost-prediction-${var.stage}"
|
||||
source = "./modules/ecr"
|
||||
}
|
||||
|
||||
##############################################
|
||||
# CDN - Cloudfront
|
||||
##############################################
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue