updated to use xgboost - much better performance

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-02 17:29:34 +01:00
parent 7790822e76
commit 39a4c2e975

View file

@ -1,16 +1,23 @@
import pandas as pd
from xgboost import XGBRegressor
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import RFECV
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class EnergyConsumptionModel:
FEATURES = {
"heating_kwh": [
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
"heating-cost-current",
"heating-cost-current", "main-fuel", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
"mainheat-energy-eff"
],
"hot_water_kwh": [
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
@ -18,34 +25,52 @@ class EnergyConsumptionModel:
]
}
TARGETS = ['heating_kwh', 'hot_water_kwh']
CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
CATEGORICAL_COLUMNS = [
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
"number-habitable-rooms", "mainheat-energy-eff"
]
NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
"hot-water-cost-current"]
"hot-water-cost-current", "total-floor-area"]
def __init__(self, model_paths=None):
self.models = {}
self.model_paths = model_paths or {}
self.data = None
self.input_data = None
self.dummy_columns = None
self.training_predictions = {}
self.testing_predictions = {}
self.x_train = {}
self.x_test = {}
self.y_train = {}
self.y_test = {}
self.selected_features = {}
if model_paths:
for target, path in model_paths.items():
self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path)
def read_dataset(self, file_path):
"""Reads the dataset from the specified file path."""
logging.info(f"Reading dataset from {file_path}")
self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)
self.input_data = self.data.copy()
def feature_engineering(self):
# Extract date features
"""Performs feature engineering on the dataset."""
logging.info("Starting feature engineering")
self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
# Modify number of heated rooms and number of habitable rooms
# self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else
# str(x))
# self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply(
# lambda x: "10+" if x > 10 else str(x)
# )
# Convert data types
self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)
@ -65,28 +90,97 @@ class EnergyConsumptionModel:
dummy_feature_columns.append(feature)
self.dummy_columns[target] = dummy_feature_columns
def split_dataset(self, target, test_size=0.2, random_state=42):
logging.info("Feature engineering completed")
def split_dataset(self, target, test_size=0.2, random_state=42):
"""Splits the dataset into training and testing sets."""
if target not in self.TARGETS:
raise ValueError(f"Target {target} not in {self.TARGETS}")
logging.info(f"Splitting dataset for target {target}")
x = self.data[self.dummy_columns[target]]
y = self.data[target]
self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
x, y, test_size=test_size, random_state=random_state
)
def feature_selection(self, target):
"""Performs feature selection using RFECV."""
if target not in self.TARGETS:
raise ValueError(f"Target {target} not in {self.TARGETS}")
logging.info(f"Starting feature selection for target {target}")
x = self.x_train[target]
y = self.y_train[target]
# Initialize the XGBoost model and RFECV
model = XGBRegressor(objective='reg:squarederror')
selector = RFECV(model, step=1, cv=5, scoring='neg_mean_absolute_percentage_error')
selector = selector.fit(x, y)
# Get the selected features
self.selected_features[target] = x.columns[selector.support_]
# Update x_train and x_test with selected features
self.x_train[target] = x[self.selected_features[target]]
self.x_test[target] = self.x_test[target][self.selected_features[target]]
logging.info(f"Feature selection completed for target {target}")
def fit_model(self, target):
self.models[target] = LinearRegression()
"""Fits the linear regression model to the training data."""
logging.info(f"Fitting model for target {target}")
self.models[target] = XGBRegressor(objective='reg:squarederror')
self.models[target].fit(self.x_train[target], self.y_train[target])
logging.info(f"Model fitting completed for target {target}")
def evaluate_model(self, target):
y_pred = self.models[target].predict(self.x_test[target])
mse = mean_squared_error(self.y_test[target], y_pred)
r2 = r2_score(self.y_test[target], y_pred)
return {'MSE': mse, 'R2': r2}
"""Evaluates the model on training and testing data."""
logging.info(f"Evaluating model for target {target}")
y_train_pred = self.models[target].predict(self.x_train[target])
train_mse = mean_squared_error(self.y_train[target], y_train_pred)
train_r2 = r2_score(self.y_train[target], y_train_pred)
train_mape = mean_absolute_percentage_error(self.y_train[target], y_train_pred)
self.training_predictions[target] = pd.DataFrame({
'Actual': self.y_train[target],
'Predicted': y_train_pred
})
y_test_pred = self.models[target].predict(self.x_test[target])
test_mse = mean_squared_error(self.y_test[target], y_test_pred)
test_r2 = r2_score(self.y_test[target], y_test_pred)
test_mape = mean_absolute_percentage_error(self.y_test[target], y_test_pred)
self.testing_predictions[target] = pd.DataFrame({
'Actual': self.y_test[target],
'Predicted': y_test_pred
})
feature_importance = pd.DataFrame({
'Feature': self.selected_features[target],
'Importance': self.models[target].feature_importances_
}).sort_values(by='Importance', ascending=False)
logging.info(f"Evaluation completed for target {target}")
return {
'train': {
'MSE': train_mse,
'R2': train_r2,
'MAPE': train_mape,
'Feature Importance': feature_importance
},
'test': {
'MSE': test_mse,
'R2': test_r2,
'MAPE': test_mape
}
}
def save_model(self, target):
"""Saves the model to S3."""
logging.info(f"Saving model for target {target}")
run_date = datetime.now().strftime("%Y-%m-%d")
save_pickle_to_s3(
self.models[target],
@ -95,14 +189,17 @@ class EnergyConsumptionModel:
)
def score_new_data(self, new_data, target):
"""Scores new data using the trained model."""
if target not in self.models:
raise ValueError(f"Model for target {target} not loaded or trained")
new_data_transformed = self.transform_new_data(new_data)
new_data_transformed = self.transform_new_data(new_data, target)
return self.models[target].predict(new_data_transformed)
def transform_new_data(self, new_data):
# Apply the same transformations as in feature_engineering
def transform_new_data(self, new_data, target):
"""Applies the same transformations to new data as were applied to the training data."""
# TODO THis should jsut use our other transformation function
new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
@ -111,9 +208,12 @@ class EnergyConsumptionModel:
new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
# Align new data with the dummy columns from training data
new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0)
new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0)
return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns])
# Select the features used by the model
new_data = new_data[self.selected_features[target]]
return new_data
# Example usage:
@ -123,12 +223,12 @@ model.feature_engineering()
# For heating_kwh
model.split_dataset(target='heating_kwh')
model.feature_selection(target='heating_kwh')
model.fit_model(target='heating_kwh')
print(model.evaluate_model(target='heating_kwh'))
model.save_model(target='heating_kwh')
evaluation_results = model.evaluate_model(target='heating_kwh')
from pprint import pprint
# For hot_water_kwh
model.split_dataset(target='hot_water_kwh')
model.fit_model(target='hot_water_kwh')
print(model.evaluate_model(target='hot_water_kwh'))
model.save_model(target='hot_water_kwh')
pprint(evaluation_results["train"])
pprint(evaluation_results["test"])
importance_df = evaluation_results["train"]["Feature Importance"]