mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added new score_new_data function and setting up training script
This commit is contained in:
parent
671d219c88
commit
523ca28b68
2 changed files with 24 additions and 85 deletions
|
|
@ -26,8 +26,6 @@ class EnergyConsumptionModel:
|
|||
"flat-storey-count", "unheated-corridor-length", "solar-water-heating-flag", "mechanical-ventilation",
|
||||
"low-energy-lighting", "environment-impact-current", "energy-tariff",
|
||||
"county", "construction-age-band", "co2-emissions-current",
|
||||
# TODO: Testing
|
||||
"lighting-cost-current", "hot-water-cost-current", "current-energy-rating"
|
||||
],
|
||||
"hot_water_kwh": [
|
||||
"lodgement-year", "lodgement-month",
|
||||
|
|
@ -144,9 +142,9 @@ class EnergyConsumptionModel:
|
|||
self.data = self.data.drop(columns=["original_description", "thermal_transmittance", "from", "to"])
|
||||
|
||||
# Modify number of heated rooms and number of habitable rooms
|
||||
self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(
|
||||
lambda x: "16_or_more" if x > 15 else str(x)
|
||||
)
|
||||
# self.data["number-heated-rooms"] = self.data["number-heated-rooms"].apply(
|
||||
# lambda x: "16_or_more" if x > 15 else str(x)
|
||||
# )
|
||||
# self.data["number-habitable-rooms"] = self.data["number-habitable-rooms"].apply(
|
||||
# lambda x: "10+" if x > 10 else str(x)
|
||||
# )
|
||||
|
|
@ -398,93 +396,29 @@ class EnergyConsumptionModel:
|
|||
if target not in self.models:
|
||||
raise ValueError(f"Model for target {target} not loaded or trained")
|
||||
|
||||
new_data_transformed = self.transform_new_data(new_data, target)
|
||||
return self.models[target].predict(new_data_transformed)
|
||||
# Verify that self.data is None
|
||||
if self.data is not None:
|
||||
raise ValueError("self.data is not None. Ensure that self.data is reset before scoring new data.")
|
||||
|
||||
def transform_new_data(self, new_data, target):
|
||||
"""Applies the same transformations to new data as were applied to the training data."""
|
||||
# Temporarily set self.data to new data
|
||||
self.data = new_data.copy()
|
||||
|
||||
# TODO: This should just use our other transformation function
|
||||
new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
|
||||
new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
|
||||
new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
|
||||
# Run feature engineering
|
||||
self.feature_engineering()
|
||||
|
||||
# Convert categorical columns to dummies
|
||||
new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
|
||||
# Select the transformed data
|
||||
new_data_transformed = self.data[self.dummy_columns[target]]
|
||||
|
||||
# Align new data with the dummy columns from training data
|
||||
new_data = new_data.reindex(columns=self.dummy_columns[target], fill_value=0)
|
||||
# Ensure the columns match the selected features
|
||||
new_data_transformed = new_data_transformed[self.selected_features[target]]
|
||||
|
||||
# Select the features used by the model
|
||||
new_data = new_data[self.selected_features[target]]
|
||||
# Generate predictions
|
||||
predictions = self.models[target].predict(new_data_transformed)
|
||||
|
||||
return new_data
|
||||
# Reset self.data to None
|
||||
self.data = None
|
||||
|
||||
def error_analysis(self, target, top_n=10, unique_threshold=0.8):
|
||||
"""
|
||||
Perform error analysis on the provided model and dataset.
|
||||
|
||||
Parameters:
|
||||
- target: The target variable to analyze.
|
||||
- top_n: Number of top residuals to consider for analysis.
|
||||
- unique_threshold: Threshold to exclude columns with high unique values.
|
||||
|
||||
Returns:
|
||||
- summary: Dictionary summarizing common features among poorly performing rows.
|
||||
"""
|
||||
|
||||
# Calculate predictions and residuals
|
||||
y_train_pred = self.models[target].predict(self.x_train[target])
|
||||
y_test_pred = self.models[target].predict(self.x_test[target])
|
||||
|
||||
train_residuals = self.y_train[target] - y_train_pred
|
||||
test_residuals = self.y_test[target] - y_test_pred
|
||||
|
||||
# Identify top N poorly performing rows by absolute residuals
|
||||
top_train_indices = train_residuals.abs().nlargest(top_n).index
|
||||
top_test_indices = test_residuals.abs().nlargest(top_n).index
|
||||
|
||||
top_train_data = self.input_data.loc[top_train_indices]
|
||||
top_test_data = self.input_data.loc[top_test_indices]
|
||||
|
||||
# Automatically detect and exclude columns
|
||||
def exclude_columns(data, threshold):
|
||||
exclude_cols = []
|
||||
num_rows = data.shape[0]
|
||||
for col in data.columns:
|
||||
if data[col].dtype == 'object' and data[col].nunique() / num_rows >= threshold:
|
||||
exclude_cols.append(col)
|
||||
return exclude_cols
|
||||
|
||||
exclude_cols = exclude_columns(top_train_data, unique_threshold)
|
||||
|
||||
top_train_data = top_train_data.drop(columns=exclude_cols)
|
||||
top_test_data = top_test_data.drop(columns=exclude_cols)
|
||||
|
||||
# One-hot encode categorical variables
|
||||
categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist()
|
||||
top_train_data_encoded = pd.get_dummies(top_train_data, columns=categorical_columns, drop_first=True)
|
||||
top_test_data_encoded = pd.get_dummies(top_test_data, columns=categorical_columns, drop_first=True)
|
||||
|
||||
# Ensure all original columns are included in the encoded data
|
||||
top_train_data_encoded = top_train_data_encoded.reindex(columns=self.input_data.columns, fill_value=0)
|
||||
top_test_data_encoded = top_test_data_encoded.reindex(columns=self.input_data.columns, fill_value=0)
|
||||
|
||||
# Correlation analysis with residuals
|
||||
train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices])
|
||||
test_corr = top_test_data_encoded.corrwith(test_residuals.loc[top_test_indices])
|
||||
|
||||
# Return summaries
|
||||
summary = {
|
||||
"train_summary": top_train_data.describe(include='all').T,
|
||||
"test_summary": top_test_data.describe(include='all').T,
|
||||
"train_corr": train_corr,
|
||||
"test_corr": test_corr,
|
||||
"top_train_data": top_train_data,
|
||||
"top_test_data": top_test_data
|
||||
}
|
||||
|
||||
return summary
|
||||
return predictions
|
||||
|
||||
|
||||
# Usage:
|
||||
|
|
|
|||
5
etl/bill_savings/training.py
Normal file
5
etl/bill_savings/training.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
def hanlder():
|
||||
"""
|
||||
This function is used to train the model and store the final models in S3 as pickles
|
||||
:return:
|
||||
"""
|
||||
Loading…
Add table
Reference in a new issue