diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index d2c77e48..ca221175 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -1,16 +1,23 @@ import pandas as pd +from xgboost import XGBRegressor from datetime import datetime from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression -from sklearn.metrics import mean_squared_error, r2_score +from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error +from sklearn.feature_selection import RFECV from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') class EnergyConsumptionModel: FEATURES = { "heating_kwh": [ "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", - "heating-cost-current", + "heating-cost-current", "main-fuel", "total-floor-area", "number-heated-rooms", "number-habitable-rooms", + "mainheat-energy-eff" ], "hot_water_kwh": [ "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", @@ -18,34 +25,52 @@ class EnergyConsumptionModel: ] } TARGETS = ['heating_kwh', 'hot_water_kwh'] - CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"] + CATEGORICAL_COLUMNS = [ + "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms", + "number-habitable-rooms", "mainheat-energy-eff" + ] NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current", - "hot-water-cost-current"] + "hot-water-cost-current", "total-floor-area"] def __init__(self, model_paths=None): self.models = {} self.model_paths = model_paths or {} self.data = None + self.input_data = None self.dummy_columns = None + self.training_predictions = {} + self.testing_predictions = {} self.x_train = {} self.x_test = {} self.y_train = {} self.y_test = {} + self.selected_features = {} if model_paths: for target, path in model_paths.items(): self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path) def read_dataset(self, file_path): + """Reads the dataset from the specified file path.""" + logging.info(f"Reading dataset from {file_path}") self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path) + self.input_data = self.data.copy() def feature_engineering(self): - # Extract date features + """Performs feature engineering on the dataset.""" + logging.info("Starting feature engineering") self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"]) self.data["lodgement-year"] = self.data["lodgement-date"].dt.year self.data["lodgement-month"] = self.data["lodgement-date"].dt.month + # Modify number of heated rooms and number of habitable rooms + # self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else + # str(x)) + # self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply( + # lambda x: "10+" if x > 10 else str(x) + # ) + # Convert data types self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric) self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str) @@ -65,28 +90,97 @@ class EnergyConsumptionModel: dummy_feature_columns.append(feature) self.dummy_columns[target] = dummy_feature_columns - def split_dataset(self, target, test_size=0.2, random_state=42): + logging.info("Feature engineering completed") + def split_dataset(self, target, test_size=0.2, random_state=42): + """Splits the dataset into training and testing sets.""" if target not in self.TARGETS: raise ValueError(f"Target {target} not in {self.TARGETS}") + logging.info(f"Splitting dataset for target {target}") x = self.data[self.dummy_columns[target]] y = self.data[target] self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split( x, y, test_size=test_size, random_state=random_state ) + def feature_selection(self, target): + """Performs feature selection using RFECV.""" + if target not in self.TARGETS: + raise ValueError(f"Target {target} not in {self.TARGETS}") + + logging.info(f"Starting feature selection for target {target}") + x = self.x_train[target] + y = self.y_train[target] + + # Initialize the XGBoost model and RFECV + model = XGBRegressor(objective='reg:squarederror') + selector = RFECV(model, step=1, cv=5, scoring='neg_mean_absolute_percentage_error') + selector = selector.fit(x, y) + + # Get the selected features + self.selected_features[target] = x.columns[selector.support_] + + # Update x_train and x_test with selected features + self.x_train[target] = x[self.selected_features[target]] + self.x_test[target] = self.x_test[target][self.selected_features[target]] + + logging.info(f"Feature selection completed for target {target}") + def fit_model(self, target): - self.models[target] = LinearRegression() + """Fits the linear regression model to the training data.""" + logging.info(f"Fitting model for target {target}") + self.models[target] = XGBRegressor(objective='reg:squarederror') self.models[target].fit(self.x_train[target], self.y_train[target]) + logging.info(f"Model fitting completed for target {target}") def evaluate_model(self, target): - y_pred = self.models[target].predict(self.x_test[target]) - mse = mean_squared_error(self.y_test[target], y_pred) - r2 = r2_score(self.y_test[target], y_pred) - return {'MSE': mse, 'R2': r2} + """Evaluates the model on training and testing data.""" + logging.info(f"Evaluating model for target {target}") + y_train_pred = self.models[target].predict(self.x_train[target]) + train_mse = mean_squared_error(self.y_train[target], y_train_pred) + train_r2 = r2_score(self.y_train[target], y_train_pred) + train_mape = mean_absolute_percentage_error(self.y_train[target], y_train_pred) + + self.training_predictions[target] = pd.DataFrame({ + 'Actual': self.y_train[target], + 'Predicted': y_train_pred + }) + + y_test_pred = self.models[target].predict(self.x_test[target]) + test_mse = mean_squared_error(self.y_test[target], y_test_pred) + test_r2 = r2_score(self.y_test[target], y_test_pred) + test_mape = mean_absolute_percentage_error(self.y_test[target], y_test_pred) + + self.testing_predictions[target] = pd.DataFrame({ + 'Actual': self.y_test[target], + 'Predicted': y_test_pred + }) + + feature_importance = pd.DataFrame({ + 'Feature': self.selected_features[target], + 'Importance': self.models[target].feature_importances_ + }).sort_values(by='Importance', ascending=False) + + logging.info(f"Evaluation completed for target {target}") + + return { + 'train': { + 'MSE': train_mse, + 'R2': train_r2, + 'MAPE': train_mape, + 'Feature Importance': feature_importance + }, + 'test': { + 'MSE': test_mse, + 'R2': test_r2, + 'MAPE': test_mape + } + } def save_model(self, target): + """Saves the model to S3.""" + logging.info(f"Saving model for target {target}") run_date = datetime.now().strftime("%Y-%m-%d") save_pickle_to_s3( self.models[target], @@ -95,14 +189,17 @@ class EnergyConsumptionModel: ) def score_new_data(self, new_data, target): + """Scores new data using the trained model.""" if target not in self.models: raise ValueError(f"Model for target {target} not loaded or trained") - new_data_transformed = self.transform_new_data(new_data) + new_data_transformed = self.transform_new_data(new_data, target) return self.models[target].predict(new_data_transformed) - def transform_new_data(self, new_data): - # Apply the same transformations as in feature_engineering + def transform_new_data(self, new_data, target): + """Applies the same transformations to new data as were applied to the training data.""" + + # TODO THis should jsut use our other transformation function new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"]) new_data["lodgement-year"] = new_data["lodgement-date"].dt.year new_data["lodgement-month"] = new_data["lodgement-date"].dt.month @@ -111,9 +208,12 @@ class EnergyConsumptionModel: new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True) # Align new data with the dummy columns from training data - new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0) + new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0) - return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns]) + # Select the features used by the model + new_data = new_data[self.selected_features[target]] + + return new_data # Example usage: @@ -123,12 +223,12 @@ model.feature_engineering() # For heating_kwh model.split_dataset(target='heating_kwh') +model.feature_selection(target='heating_kwh') model.fit_model(target='heating_kwh') -print(model.evaluate_model(target='heating_kwh')) -model.save_model(target='heating_kwh') +evaluation_results = model.evaluate_model(target='heating_kwh') +from pprint import pprint -# For hot_water_kwh -model.split_dataset(target='hot_water_kwh') -model.fit_model(target='hot_water_kwh') -print(model.evaluate_model(target='hot_water_kwh')) -model.save_model(target='hot_water_kwh') +pprint(evaluation_results["train"]) +pprint(evaluation_results["test"]) + +importance_df = evaluation_results["train"]["Feature Importance"]