mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
updated to use xgboost - much better performance
This commit is contained in:
parent
7790822e76
commit
39a4c2e975
1 changed files with 123 additions and 23 deletions
|
|
@ -1,16 +1,23 @@
|
|||
import pandas as pd
|
||||
from xgboost import XGBRegressor
|
||||
from datetime import datetime
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
|
||||
from sklearn.feature_selection import RFECV
|
||||
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
|
||||
import logging
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
|
||||
class EnergyConsumptionModel:
|
||||
FEATURES = {
|
||||
"heating_kwh": [
|
||||
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
|
||||
"heating-cost-current",
|
||||
"heating-cost-current", "main-fuel", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
|
||||
"mainheat-energy-eff"
|
||||
],
|
||||
"hot_water_kwh": [
|
||||
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
|
||||
|
|
@ -18,34 +25,52 @@ class EnergyConsumptionModel:
|
|||
]
|
||||
}
|
||||
TARGETS = ['heating_kwh', 'hot_water_kwh']
|
||||
CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
|
||||
CATEGORICAL_COLUMNS = [
|
||||
"lodgement-year", "lodgement-month", "main-fuel", "mainheat-description", "number-heated-rooms",
|
||||
"number-habitable-rooms", "mainheat-energy-eff"
|
||||
]
|
||||
NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
|
||||
"hot-water-cost-current"]
|
||||
"hot-water-cost-current", "total-floor-area"]
|
||||
|
||||
def __init__(self, model_paths=None):
|
||||
self.models = {}
|
||||
self.model_paths = model_paths or {}
|
||||
self.data = None
|
||||
self.input_data = None
|
||||
self.dummy_columns = None
|
||||
self.training_predictions = {}
|
||||
self.testing_predictions = {}
|
||||
|
||||
self.x_train = {}
|
||||
self.x_test = {}
|
||||
self.y_train = {}
|
||||
self.y_test = {}
|
||||
self.selected_features = {}
|
||||
|
||||
if model_paths:
|
||||
for target, path in model_paths.items():
|
||||
self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path)
|
||||
|
||||
def read_dataset(self, file_path):
|
||||
"""Reads the dataset from the specified file path."""
|
||||
logging.info(f"Reading dataset from {file_path}")
|
||||
self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)
|
||||
self.input_data = self.data.copy()
|
||||
|
||||
def feature_engineering(self):
|
||||
# Extract date features
|
||||
"""Performs feature engineering on the dataset."""
|
||||
logging.info("Starting feature engineering")
|
||||
self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
|
||||
self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
|
||||
self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
|
||||
|
||||
# Modify number of heated rooms and number of habitable rooms
|
||||
# self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(lambda x: "10+" if x > 10 else
|
||||
# str(x))
|
||||
# self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply(
|
||||
# lambda x: "10+" if x > 10 else str(x)
|
||||
# )
|
||||
|
||||
# Convert data types
|
||||
self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
|
||||
self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)
|
||||
|
|
@ -65,28 +90,97 @@ class EnergyConsumptionModel:
|
|||
dummy_feature_columns.append(feature)
|
||||
self.dummy_columns[target] = dummy_feature_columns
|
||||
|
||||
def split_dataset(self, target, test_size=0.2, random_state=42):
|
||||
logging.info("Feature engineering completed")
|
||||
|
||||
def split_dataset(self, target, test_size=0.2, random_state=42):
|
||||
"""Splits the dataset into training and testing sets."""
|
||||
if target not in self.TARGETS:
|
||||
raise ValueError(f"Target {target} not in {self.TARGETS}")
|
||||
|
||||
logging.info(f"Splitting dataset for target {target}")
|
||||
x = self.data[self.dummy_columns[target]]
|
||||
y = self.data[target]
|
||||
self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
|
||||
x, y, test_size=test_size, random_state=random_state
|
||||
)
|
||||
|
||||
def feature_selection(self, target):
|
||||
"""Performs feature selection using RFECV."""
|
||||
if target not in self.TARGETS:
|
||||
raise ValueError(f"Target {target} not in {self.TARGETS}")
|
||||
|
||||
logging.info(f"Starting feature selection for target {target}")
|
||||
x = self.x_train[target]
|
||||
y = self.y_train[target]
|
||||
|
||||
# Initialize the XGBoost model and RFECV
|
||||
model = XGBRegressor(objective='reg:squarederror')
|
||||
selector = RFECV(model, step=1, cv=5, scoring='neg_mean_absolute_percentage_error')
|
||||
selector = selector.fit(x, y)
|
||||
|
||||
# Get the selected features
|
||||
self.selected_features[target] = x.columns[selector.support_]
|
||||
|
||||
# Update x_train and x_test with selected features
|
||||
self.x_train[target] = x[self.selected_features[target]]
|
||||
self.x_test[target] = self.x_test[target][self.selected_features[target]]
|
||||
|
||||
logging.info(f"Feature selection completed for target {target}")
|
||||
|
||||
def fit_model(self, target):
|
||||
self.models[target] = LinearRegression()
|
||||
"""Fits the linear regression model to the training data."""
|
||||
logging.info(f"Fitting model for target {target}")
|
||||
self.models[target] = XGBRegressor(objective='reg:squarederror')
|
||||
self.models[target].fit(self.x_train[target], self.y_train[target])
|
||||
logging.info(f"Model fitting completed for target {target}")
|
||||
|
||||
def evaluate_model(self, target):
|
||||
y_pred = self.models[target].predict(self.x_test[target])
|
||||
mse = mean_squared_error(self.y_test[target], y_pred)
|
||||
r2 = r2_score(self.y_test[target], y_pred)
|
||||
return {'MSE': mse, 'R2': r2}
|
||||
"""Evaluates the model on training and testing data."""
|
||||
logging.info(f"Evaluating model for target {target}")
|
||||
y_train_pred = self.models[target].predict(self.x_train[target])
|
||||
train_mse = mean_squared_error(self.y_train[target], y_train_pred)
|
||||
train_r2 = r2_score(self.y_train[target], y_train_pred)
|
||||
train_mape = mean_absolute_percentage_error(self.y_train[target], y_train_pred)
|
||||
|
||||
self.training_predictions[target] = pd.DataFrame({
|
||||
'Actual': self.y_train[target],
|
||||
'Predicted': y_train_pred
|
||||
})
|
||||
|
||||
y_test_pred = self.models[target].predict(self.x_test[target])
|
||||
test_mse = mean_squared_error(self.y_test[target], y_test_pred)
|
||||
test_r2 = r2_score(self.y_test[target], y_test_pred)
|
||||
test_mape = mean_absolute_percentage_error(self.y_test[target], y_test_pred)
|
||||
|
||||
self.testing_predictions[target] = pd.DataFrame({
|
||||
'Actual': self.y_test[target],
|
||||
'Predicted': y_test_pred
|
||||
})
|
||||
|
||||
feature_importance = pd.DataFrame({
|
||||
'Feature': self.selected_features[target],
|
||||
'Importance': self.models[target].feature_importances_
|
||||
}).sort_values(by='Importance', ascending=False)
|
||||
|
||||
logging.info(f"Evaluation completed for target {target}")
|
||||
|
||||
return {
|
||||
'train': {
|
||||
'MSE': train_mse,
|
||||
'R2': train_r2,
|
||||
'MAPE': train_mape,
|
||||
'Feature Importance': feature_importance
|
||||
},
|
||||
'test': {
|
||||
'MSE': test_mse,
|
||||
'R2': test_r2,
|
||||
'MAPE': test_mape
|
||||
}
|
||||
}
|
||||
|
||||
def save_model(self, target):
|
||||
"""Saves the model to S3."""
|
||||
logging.info(f"Saving model for target {target}")
|
||||
run_date = datetime.now().strftime("%Y-%m-%d")
|
||||
save_pickle_to_s3(
|
||||
self.models[target],
|
||||
|
|
@ -95,14 +189,17 @@ class EnergyConsumptionModel:
|
|||
)
|
||||
|
||||
def score_new_data(self, new_data, target):
|
||||
"""Scores new data using the trained model."""
|
||||
if target not in self.models:
|
||||
raise ValueError(f"Model for target {target} not loaded or trained")
|
||||
|
||||
new_data_transformed = self.transform_new_data(new_data)
|
||||
new_data_transformed = self.transform_new_data(new_data, target)
|
||||
return self.models[target].predict(new_data_transformed)
|
||||
|
||||
def transform_new_data(self, new_data):
|
||||
# Apply the same transformations as in feature_engineering
|
||||
def transform_new_data(self, new_data, target):
|
||||
"""Applies the same transformations to new data as were applied to the training data."""
|
||||
|
||||
# TODO THis should jsut use our other transformation function
|
||||
new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
|
||||
new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
|
||||
new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
|
||||
|
|
@ -111,9 +208,12 @@ class EnergyConsumptionModel:
|
|||
new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
|
||||
|
||||
# Align new data with the dummy columns from training data
|
||||
new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0)
|
||||
new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0)
|
||||
|
||||
return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns])
|
||||
# Select the features used by the model
|
||||
new_data = new_data[self.selected_features[target]]
|
||||
|
||||
return new_data
|
||||
|
||||
|
||||
# Example usage:
|
||||
|
|
@ -123,12 +223,12 @@ model.feature_engineering()
|
|||
|
||||
# For heating_kwh
|
||||
model.split_dataset(target='heating_kwh')
|
||||
model.feature_selection(target='heating_kwh')
|
||||
model.fit_model(target='heating_kwh')
|
||||
print(model.evaluate_model(target='heating_kwh'))
|
||||
model.save_model(target='heating_kwh')
|
||||
evaluation_results = model.evaluate_model(target='heating_kwh')
|
||||
from pprint import pprint
|
||||
|
||||
# For hot_water_kwh
|
||||
model.split_dataset(target='hot_water_kwh')
|
||||
model.fit_model(target='hot_water_kwh')
|
||||
print(model.evaluate_model(target='hot_water_kwh'))
|
||||
model.save_model(target='hot_water_kwh')
|
||||
pprint(evaluation_results["train"])
|
||||
pprint(evaluation_results["test"])
|
||||
|
||||
importance_df = evaluation_results["train"]["Feature Importance"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue