updated to use xgboost - much better performance

2026-07-27 23:35:01 +00:00 · 2024-07-02 17:29:34 +01:00 · 2024-07-02 17:29:34 +01:00 · 39a4c2e975
commit 39a4c2e975
parent 7790822e76
1 changed files with 123 additions and 23 deletions
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@ -1,16 +1,23 @@
 import pandas as pd
+from xgboost import XGBRegressor
 from datetime import datetime
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
-from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
+from sklearn.feature_selection import RFECV
 from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


 class EnergyConsumptionModel:
    FEATURES = {
        "heating_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-            "heating-cost-current",
+            "heating-cost-current", "main-fuel", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
+            "mainheat-energy-eff"
        ],
        "hot_water_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
@ -18,34 +25,52 @@ class EnergyConsumptionModel:
        ]
    }
    TARGETS = ['heating_kwh', 'hot_water_kwh']
-    CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
+    CATEGORICAL_COLUMNS = [
+        "lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
+        "number-habitable-rooms", "mainheat-energy-eff"
+    ]
    NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
-                         "hot-water-cost-current"]
+                         "hot-water-cost-current", "total-floor-area"]

    def __init__(self, model_paths=None):
        self.models = {}
        self.model_paths = model_paths or {}
        self.data = None
+        self.input_data = None
        self.dummy_columns = None
+        self.training_predictions = {}
+        self.testing_predictions = {}

        self.x_train = {}
        self.x_test = {}
        self.y_train = {}
        self.y_test = {}
+        self.selected_features = {}

        if model_paths:
            for target, path in model_paths.items():
                self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path)

    def read_dataset(self, file_path):
+        """Reads the dataset from the specified file path."""
+        logging.info(f"Reading dataset from {file_path}")
        self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)
+        self.input_data = self.data.copy()

    def feature_engineering(self):
-        # Extract date features
+        """Performs feature engineering on the dataset."""
+        logging.info("Starting feature engineering")
        self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
        self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
        self.data["lodgement-month"] = self.data["lodgement-date"].dt.month

+        # Modify number of heated rooms and number of habitable rooms
+        # self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else
+        # str(x))
+        # self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply(
+        #     lambda x: "10+" if x > 10 else str(x)
+        # )
+
        # Convert data types
        self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
        self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)
@ -65,28 +90,97 @@ class EnergyConsumptionModel:
                    dummy_feature_columns.append(feature)
            self.dummy_columns[target] = dummy_feature_columns

-    def split_dataset(self, target, test_size=0.2, random_state=42):
+        logging.info("Feature engineering completed")

+    def split_dataset(self, target, test_size=0.2, random_state=42):
+        """Splits the dataset into training and testing sets."""
        if target not in self.TARGETS:
            raise ValueError(f"Target {target} not in {self.TARGETS}")

+        logging.info(f"Splitting dataset for target {target}")
        x = self.data[self.dummy_columns[target]]
        y = self.data[target]
        self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
            x, y, test_size=test_size, random_state=random_state
        )

+    def feature_selection(self, target):
+        """Performs feature selection using RFECV."""
+        if target not in self.TARGETS:
+            raise ValueError(f"Target {target} not in {self.TARGETS}")
+
+        logging.info(f"Starting feature selection for target {target}")
+        x = self.x_train[target]
+        y = self.y_train[target]
+
+        # Initialize the XGBoost model and RFECV
+        model = XGBRegressor(objective='reg:squarederror')
+        selector = RFECV(model, step=1, cv=5, scoring='neg_mean_absolute_percentage_error')
+        selector = selector.fit(x, y)
+
+        # Get the selected features
+        self.selected_features[target] = x.columns[selector.support_]
+
+        # Update x_train and x_test with selected features
+        self.x_train[target] = x[self.selected_features[target]]
+        self.x_test[target] = self.x_test[target][self.selected_features[target]]
+
+        logging.info(f"Feature selection completed for target {target}")
+
    def fit_model(self, target):
-        self.models[target] = LinearRegression()
+        """Fits the linear regression model to the training data."""
+        logging.info(f"Fitting model for target {target}")
+        self.models[target] = XGBRegressor(objective='reg:squarederror')
        self.models[target].fit(self.x_train[target], self.y_train[target])
+        logging.info(f"Model fitting completed for target {target}")

    def evaluate_model(self, target):
-        y_pred = self.models[target].predict(self.x_test[target])
-        mse = mean_squared_error(self.y_test[target], y_pred)
-        r2 = r2_score(self.y_test[target], y_pred)
-        return {'MSE': mse, 'R2': r2}
+        """Evaluates the model on training and testing data."""
+        logging.info(f"Evaluating model for target {target}")
+        y_train_pred = self.models[target].predict(self.x_train[target])
+        train_mse = mean_squared_error(self.y_train[target], y_train_pred)
+        train_r2 = r2_score(self.y_train[target], y_train_pred)
+        train_mape = mean_absolute_percentage_error(self.y_train[target], y_train_pred)
+
+        self.training_predictions[target] = pd.DataFrame({
+            'Actual': self.y_train[target],
+            'Predicted': y_train_pred
+        })
+
+        y_test_pred = self.models[target].predict(self.x_test[target])
+        test_mse = mean_squared_error(self.y_test[target], y_test_pred)
+        test_r2 = r2_score(self.y_test[target], y_test_pred)
+        test_mape = mean_absolute_percentage_error(self.y_test[target], y_test_pred)
+
+        self.testing_predictions[target] = pd.DataFrame({
+            'Actual': self.y_test[target],
+            'Predicted': y_test_pred
+        })
+
+        feature_importance = pd.DataFrame({
+            'Feature': self.selected_features[target],
+            'Importance': self.models[target].feature_importances_
+        }).sort_values(by='Importance', ascending=False)
+
+        logging.info(f"Evaluation completed for target {target}")
+
+        return {
+            'train': {
+                'MSE': train_mse,
+                'R2': train_r2,
+                'MAPE': train_mape,
+                'Feature Importance': feature_importance
+            },
+            'test': {
+                'MSE': test_mse,
+                'R2': test_r2,
+                'MAPE': test_mape
+            }
+        }

    def save_model(self, target):
+        """Saves the model to S3."""
+        logging.info(f"Saving model for target {target}")
        run_date = datetime.now().strftime("%Y-%m-%d")
        save_pickle_to_s3(
            self.models[target],
@ -95,14 +189,17 @@ class EnergyConsumptionModel:
        )

    def score_new_data(self, new_data, target):
+        """Scores new data using the trained model."""
        if target not in self.models:
            raise ValueError(f"Model for target {target} not loaded or trained")

-        new_data_transformed = self.transform_new_data(new_data)
+        new_data_transformed = self.transform_new_data(new_data, target)
        return self.models[target].predict(new_data_transformed)

-    def transform_new_data(self, new_data):
-        # Apply the same transformations as in feature_engineering
+    def transform_new_data(self, new_data, target):
+        """Applies the same transformations to new data as were applied to the training data."""
+
+        # TODO THis should jsut use our other transformation function
        new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
        new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
        new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
@ -111,9 +208,12 @@ class EnergyConsumptionModel:
        new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)

        # Align new data with the dummy columns from training data
-        new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0)
+        new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0)

-        return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns])
+        # Select the features used by the model
+        new_data = new_data[self.selected_features[target]]
+
+        return new_data


 # Example usage:
@ -123,12 +223,12 @@ model.feature_engineering()

 # For heating_kwh
 model.split_dataset(target='heating_kwh')
+model.feature_selection(target='heating_kwh')
 model.fit_model(target='heating_kwh')
-print(model.evaluate_model(target='heating_kwh'))
-model.save_model(target='heating_kwh')
+evaluation_results = model.evaluate_model(target='heating_kwh')
+from pprint import pprint

-# For hot_water_kwh
-model.split_dataset(target='hot_water_kwh')
-model.fit_model(target='hot_water_kwh')
-print(model.evaluate_model(target='hot_water_kwh'))
-model.save_model(target='hot_water_kwh')
+pprint(evaluation_results["train"])
+pprint(evaluation_results["test"])
+
+importance_df = evaluation_results["train"]["Feature Importance"]