From 523ca28b686da9ef292fcb2dd3f88cb268936d52 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 5 Jul 2024 12:49:23 +0100 Subject: [PATCH] Added new score_new_data function and setting up training script --- etl/bill_savings/EnergyConsumptionModel.py | 104 ++++----------------- etl/bill_savings/training.py | 5 + 2 files changed, 24 insertions(+), 85 deletions(-) create mode 100644 etl/bill_savings/training.py diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index b616be08..14ece803 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -26,8 +26,6 @@ class EnergyConsumptionModel: "flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation", "low-energy-lighting", "environment-impact-current", "energy-tariff", "county", "construction-age-band", "co2-emissions-current", - # TODO: Testing - "lighting-cost-current", "hot-water-cost-current", "current-energy-rating" ], "hot_water_kwh": [ "lodgement-year", "lodgement-month", @@ -144,9 +142,9 @@ class EnergyConsumptionModel: self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"]) # Modify number of heated rooms and number of habitable rooms - self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply( - lambda x: "16_or_more" if x > 15 else str(x) - ) + # self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply( + # lambda x: "16_or_more" if x > 15 else str(x) + # ) # self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply( # lambda x: "10+" if x > 10 else str(x) # ) @@ -398,93 +396,29 @@ class EnergyConsumptionModel: if target not in self.models: raise ValueError(f"Model for target {target} not loaded or trained") - new_data_transformed = self.transform_new_data(new_data, target) - return self.models[target].predict(new_data_transformed) + # Verify that self.data is 
None + if self.data is not None: + raise ValueError("self.data is not None. Ensure that self.data is reset before scoring new data.") - def transform_new_data(self, new_data, target): - """Applies the same transformations to new data as were applied to the training data.""" + # Temporarily set self.data to new data + self.data = new_data.copy() - # TODO THis should jsut use our other transformation function - new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"]) - new_data["lodgement-year"] = new_data["lodgement-date"].dt.year - new_data["lodgement-month"] = new_data["lodgement-date"].dt.month + # Run feature engineering + self.feature_engineering() - # Convert categorical columns to dummies - new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True) + # Select the transformed data + new_data_transformed = self.data[self.dummy_columns[target]] - # Align new data with the dummy columns from training data - new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0) + # Ensure the columns match the selected features + new_data_transformed = new_data_transformed[self.selected_features[target]] - # Select the features used by the model - new_data = new_data[self.selected_features[target]] + # Generate predictions + predictions = self.models[target].predict(new_data_transformed) - return new_data + # Reset self.data to None + self.data = None - def error_analysis(self, target, top_n=10, unique_threshold=0.8): - """ - Perform error analysis on the provided model and dataset. - - Parameters: - - target: The target variable to analyze. - - top_n: Number of top residuals to consider for analysis. - - unique_threshold: Threshold to exclude columns with high unique values. - - Returns: - - summary: Dictionary summarizing common features among poorly performing rows. 
- """ - - # Calculate predictions and residuals - y_train_pred = self.models[target].predict(self.x_train[target]) - y_test_pred = self.models[target].predict(self.x_test[target]) - - train_residuals = self.y_train[target] - y_train_pred - test_residuals = self.y_test[target] - y_test_pred - - # Identify top N poorly performing rows by absolute residuals - top_train_indices = train_residuals.abs().nlargest(top_n).index - top_test_indices = test_residuals.abs().nlargest(top_n).index - - top_train_data = self.input_data.loc[top_train_indices] - top_test_data = self.input_data.loc[top_test_indices] - - # Automatically detect and exclude columns - def exclude_columns(data, threshold): - exclude_cols = [] - num_rows = data.shape[0] - for col in data.columns: - if data[col].dtype == 'object' and data[col].nunique() / num_rows >= threshold: - exclude_cols.append(col) - return exclude_cols - - exclude_cols = exclude_columns(top_train_data, unique_threshold) - - top_train_data = top_train_data.drop(columns=exclude_cols) - top_test_data = top_test_data.drop(columns=exclude_cols) - - # One-hot encode categorical variables - categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist() - top_train_data_encoded = pd.get_dummies(top_train_data, columns=categorical_columns, drop_first=True) - top_test_data_encoded = pd.get_dummies(top_test_data, columns=categorical_columns, drop_first=True) - - # Ensure all original columns are included in the encoded data - top_train_data_encoded = top_train_data_encoded.reindex(columns=self.input_data.columns, fill_value=0) - top_test_data_encoded = top_test_data_encoded.reindex(columns=self.input_data.columns, fill_value=0) - - # Correlation analysis with residuals - train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices]) - test_corr = top_test_data_encoded.corrwith(test_residuals.loc[top_test_indices]) - - # Return summaries - summary = { - "train_summary": 
top_train_data.describe(include='all').T, - "test_summary": top_test_data.describe(include='all').T, - "train_corr": train_corr, - "test_corr": test_corr, - "top_train_data": top_train_data, - "top_test_data": top_test_data - } - - return summary + return predictions # Usage: diff --git a/etl/bill_savings/training.py b/etl/bill_savings/training.py new file mode 100644 index 00000000..2c29c317 --- /dev/null +++ b/etl/bill_savings/training.py @@ -0,0 +1,5 @@ +def hanlder(): + """ + This function is used to train the model and store the final models in S3 as pickles + :return: + """