diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index 27fcc518..6492c7a6 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -16,8 +16,7 @@ class EnergyConsumptionModel: "heating_kwh": [ "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", "heating-cost-current", "total-floor-area", "number-heated-rooms", - # "number-habitable-rooms", - # "mainheat-energy-eff", "mainheat-description", "main-fuel", + "mainheat-description", "main-fuel", "mainheat-energy-eff", "number-habitable-rooms", ], "hot_water_kwh": [ "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", @@ -104,25 +103,41 @@ class EnergyConsumptionModel: x, y, test_size=test_size, random_state=random_state ) - def feature_selection(self, target): - """Performs feature selection using RFECV.""" + def feature_selection(self, target, cv_folds=3, sample_fraction=0.1, random_state=42): + """ + Performs feature selection using RFECV with XGBoost. + + Parameters: + - target: The target variable for feature selection. + - cv_folds: Number of cross-validation folds. + - sample_fraction: Fraction of the data to use for feature selection. + - random_state: Random state for reproducibility. + """ if target not in self.TARGETS: raise ValueError(f"Target {target} not in {self.TARGETS}") logging.info(f"Starting feature selection for target {target}") - x = self.x_train[target] - y = self.y_train[target] + + # Sample the data if specified + if sample_fraction < 1.0: + x_sample, _, y_sample, _ = train_test_split( + self.x_train[target], self.y_train[target], + train_size=sample_fraction, random_state=random_state + ) + else: + x_sample = self.x_train[target] + y_sample = self.y_train[target] # Initialize the XGBoost model and RFECV - model = XGBRegressor(objective='reg:squarederror') - selector = RFECV(model, step=1, cv=5, scoring='neg_mean_absolute_percentage_error') - selector = selector.fit(x, y) + model = XGBRegressor(objective='reg:squarederror', n_jobs=-1) + selector = RFECV(model, step=1, cv=cv_folds, scoring='neg_mean_absolute_percentage_error') + selector = selector.fit(x_sample, y_sample) # Get the selected features - self.selected_features[target] = x.columns[selector.support_] + self.selected_features[target] = x_sample.columns[selector.support_] # Update x_train and x_test with selected features - self.x_train[target] = x[self.selected_features[target]] + self.x_train[target] = self.x_train[target][self.selected_features[target]] self.x_test[target] = self.x_test[target][self.selected_features[target]] logging.info(f"Feature selection completed for target {target}") @@ -218,6 +233,14 @@ class EnergyConsumptionModel: def error_analysis(self, target, top_n=10, unique_threshold=0.8): """ Perform error analysis on the provided model and dataset. + + Parameters: + - target: The target variable to analyze. + - top_n: Number of top residuals to consider for analysis. + - unique_threshold: Threshold to exclude columns with high unique values. + + Returns: + - summary: Dictionary summarizing common features among poorly performing rows. """ # Calculate predictions and residuals @@ -234,6 +257,7 @@ class EnergyConsumptionModel: top_train_data = self.input_data.loc[top_train_indices] top_test_data = self.input_data.loc[top_test_indices] + # Automatically detect and exclude columns def exclude_columns(data, threshold): exclude_cols = [] num_rows = data.shape[0] @@ -247,16 +271,14 @@ class EnergyConsumptionModel: top_train_data = top_train_data.drop(columns=exclude_cols) top_test_data = top_test_data.drop(columns=exclude_cols) - # TODO: Not working - # One-hot encode categorical variables categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist() top_train_data_encoded = pd.get_dummies(top_train_data, columns=categorical_columns, drop_first=True) top_test_data_encoded = pd.get_dummies(top_test_data, columns=categorical_columns, drop_first=True) - # Align the encoded data with the training data - top_train_data_encoded = top_train_data_encoded.reindex(columns=self.x_train[target].columns, fill_value=0) - top_test_data_encoded = top_test_data_encoded.reindex(columns=self.x_test[target].columns, fill_value=0) + # Ensure all original columns are included in the encoded data + top_train_data_encoded = top_train_data_encoded.reindex(columns=self.input_data.columns, fill_value=0) + top_test_data_encoded = top_test_data_encoded.reindex(columns=self.input_data.columns, fill_value=0) # Correlation analysis with residuals train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices]) @@ -264,6 +286,8 @@ class EnergyConsumptionModel: # Return summaries summary = { + "train_summary": top_train_data.describe(include='all').T, + "test_summary": top_test_data.describe(include='all').T, "train_corr": train_corr, "test_corr": test_corr, "top_train_data": top_train_data, @@ -280,7 +304,7 @@ model.feature_engineering() # For heating_kwh model.split_dataset(target='heating_kwh') -model.feature_selection(target='heating_kwh') +model.feature_selection(target='heating_kwh', cv_folds=3, sample_fraction=0.1) model.fit_model(target='heating_kwh') evaluation_results = model.evaluate_model(target='heating_kwh') from pprint import pprint