Added new score_new_data function and set up training script

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-05 12:49:23 +01:00
parent 671d219c88
commit 523ca28b68
2 changed files with 24 additions and 85 deletions

View file

@ -26,8 +26,6 @@ class EnergyConsumptionModel:
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
"low-energy-lighting", "environment-impact-current", "energy-tariff",
"county", "construction-age-band", "co2-emissions-current",
# TODO: Testing
"lighting-cost-current", "hot-water-cost-current", "current-energy-rating"
],
"hot_water_kwh": [
"lodgement-year", "lodgement-month",
@ -144,9 +142,9 @@ class EnergyConsumptionModel:
self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
# Modify number of heated rooms and number of habitable rooms
self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(
lambda x: "16_or_more" if x > 15 else str(x)
)
# self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(
# lambda x: "16_or_more" if x > 15 else str(x)
# )
# self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply(
# lambda x: "10+" if x > 10 else str(x)
# )
@ -398,93 +396,29 @@ class EnergyConsumptionModel:
if target not in self.models:
raise ValueError(f"Model for target {target} not loaded or trained")
new_data_transformed = self.transform_new_data(new_data, target)
return self.models[target].predict(new_data_transformed)
# Verify that self.data is None
if self.data is not None:
raise ValueError("self.data is not None. Ensure that self.data is reset before scoring new data.")
def transform_new_data(self, new_data, target):
"""Applies the same transformations to new data as were applied to the training data."""
# Temporarily set self.data to new data
self.data = new_data.copy()
# TODO: This should just use our other transformation function
new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
# Run feature engineering
self.feature_engineering()
# Convert categorical columns to dummies
new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
# Select the transformed data
new_data_transformed = self.data[self.dummy_columns[target]]
# Align new data with the dummy columns from training data
new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0)
# Ensure the columns match the selected features
new_data_transformed = new_data_transformed[self.selected_features[target]]
# Select the features used by the model
new_data = new_data[self.selected_features[target]]
# Generate predictions
predictions = self.models[target].predict(new_data_transformed)
return new_data
# Reset self.data to None
self.data = None
def error_analysis(self, target, top_n=10, unique_threshold=0.8):
    """
    Inspect the rows on which the model for ``target`` has the largest residuals.

    Parameters:
    - target: Key used to look up the model, the train/test features and the
      train/test labels on this instance.
    - top_n: How many of the largest absolute residuals to keep per split.
    - unique_threshold: Object-typed columns whose ratio of distinct values to
      rows (measured on the worst training rows) reaches this value are dropped.

    Returns:
    - summary: Dictionary with ``describe`` summaries, residual correlations and
      the raw worst-performing rows for both the train and the test split.
    """
    model = self.models[target]
    train_pred = model.predict(self.x_train[target])
    test_pred = model.predict(self.x_test[target])

    # Residual = observed label minus prediction, kept per split.
    residuals = {
        "train": self.y_train[target] - train_pred,
        "test": self.y_test[target] - test_pred,
    }

    # Index labels of the top_n largest absolute residuals, then the matching input rows.
    worst_idx = {
        split: res.abs().nlargest(top_n).index for split, res in residuals.items()
    }
    worst_rows = {split: self.input_data.loc[idx] for split, idx in worst_idx.items()}

    # Near-unique object columns are detected on the training rows only and
    # removed from both splits.
    train_rows = worst_rows["train"]
    n_rows = train_rows.shape[0]
    near_unique = [
        name
        for name in train_rows.columns
        if train_rows[name].dtype == 'object'
        and train_rows[name].nunique() / n_rows >= unique_threshold
    ]
    worst_rows = {
        split: rows.drop(columns=near_unique) for split, rows in worst_rows.items()
    }

    # The categorical column list also comes from the training rows.
    categorical = worst_rows["train"].select_dtypes(include=['object']).columns.tolist()
    original_columns = self.input_data.columns

    correlations = {}
    for split, rows in worst_rows.items():
        encoded = pd.get_dummies(rows, columns=categorical, drop_first=True)
        # Reindexing to the original input columns keeps only columns whose names
        # exist in the input data; any other column is re-created filled with 0.
        encoded = encoded.reindex(columns=original_columns, fill_value=0)
        correlations[split] = encoded.corrwith(residuals[split].loc[worst_idx[split]])

    return {
        "train_summary": worst_rows["train"].describe(include='all').T,
        "test_summary": worst_rows["test"].describe(include='all').T,
        "train_corr": correlations["train"],
        "test_corr": correlations["test"],
        "top_train_data": worst_rows["train"],
        "top_test_data": worst_rows["test"],
    }
return predictions
# Usage:

View file

@ -0,0 +1,5 @@
def hanlder():
    """
    Train the model and store the final models in S3 as pickles.

    NOTE(review): this is currently a stub -- the body holds only this docstring,
    so calling it performs no training and no upload. The name looks like a typo
    of "handler"; confirm with whatever invokes it before renaming.

    :return: None (nothing is returned explicitly).
    """