error analysis - not working though

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-02 18:13:23 +01:00
parent 0a1f728f37
commit 14417c37df

View file

@ -15,7 +15,8 @@ class EnergyConsumptionModel:
FEATURES = {
"heating_kwh": [
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
"heating-cost-current", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
"heating-cost-current", "total-floor-area", "number-heated-rooms",
# "number-habitable-rooms",
# "mainheat-energy-eff", "mainheat-description", "main-fuel",
],
"hot_water_kwh": [
@ -214,6 +215,63 @@ class EnergyConsumptionModel:
return new_data
def error_analysis(self, target, top_n=10, unique_threshold=0.8):
    """
    Summarise the rows on which the model for ``target`` performs worst.

    Residuals (actual minus predicted) are computed for the train and test
    splits, the ``top_n`` rows with the largest absolute residual are looked
    up in ``self.input_data``, and each model feature column is correlated
    with the residuals of those rows.

    Args:
        target: Key used to index ``self.models``, ``self.x_train``,
            ``self.x_test``, ``self.y_train`` and ``self.y_test``.
        top_n: Number of worst rows to keep per split.
        unique_threshold: An object-dtype column is excluded when its
            ratio of distinct values to rows, measured on the selected
            train rows, is at least this value.

    Returns:
        dict with keys:
            "train_corr" / "test_corr": per-feature correlation with the
                residuals of the selected rows. Features that are constant
                over the selected rows yield NaN.
            "top_train_data" / "top_test_data": the selected rows of
                ``self.input_data`` with the excluded columns dropped
                (not one-hot encoded).
    """
    model = self.models[target]
    train_features = self.x_train[target]
    test_features = self.x_test[target]

    # Residual = actual - predicted; the result keeps the index of y.
    train_residuals = self.y_train[target] - model.predict(train_features)
    test_residuals = self.y_test[target] - model.predict(test_features)

    # Identify the top N poorly performing rows by absolute residual.
    top_train_indices = train_residuals.abs().nlargest(top_n).index
    top_test_indices = test_residuals.abs().nlargest(top_n).index

    # Assumes the index labels of y_train / y_test also exist in
    # self.input_data -- TODO confirm against the code that builds the splits.
    top_train_data = self.input_data.loc[top_train_indices]
    top_test_data = self.input_data.loc[top_test_indices]

    def exclude_columns(data, threshold):
        """Return object-dtype columns whose distinct-value ratio >= threshold."""
        num_rows = data.shape[0]
        if num_rows == 0:
            # Nothing selected (e.g. top_n=0): avoid dividing by zero below.
            return []
        return [
            col
            for col in data.columns
            if data[col].dtype == 'object'
            and data[col].nunique() / num_rows >= threshold
        ]

    # The exclusion list is derived from the train rows only and applied to
    # both splits so that they keep the same set of columns.
    exclude_cols = exclude_columns(top_train_data, unique_threshold)
    top_train_data = top_train_data.drop(columns=exclude_cols)
    top_test_data = top_test_data.drop(columns=exclude_cols)

    # One-hot encode categorical variables.
    # drop_first must stay False here: on a small subset the "first" category
    # depends on which values happen to be present, so dropping it could
    # remove an indicator the model really uses, which the reindex below
    # would then silently recreate as all zeros. Encoding every category and
    # letting reindex pick the model's columns avoids that.
    # NOTE(review): this still presumes the model's feature columns are named
    # the way pd.get_dummies names them ("<column>_<value>") -- confirm
    # against the preprocessing that builds x_train / x_test.
    categorical_columns = top_train_data.select_dtypes(include=['object']).columns.tolist()
    top_train_data_encoded = pd.get_dummies(
        top_train_data, columns=categorical_columns, drop_first=False, dtype=float
    )
    top_test_data_encoded = pd.get_dummies(
        top_test_data, columns=categorical_columns, drop_first=False, dtype=float
    )

    # Align the encoded data with the columns the model was fitted on;
    # indicators for categories absent from the selected rows become 0.
    top_train_data_encoded = top_train_data_encoded.reindex(
        columns=train_features.columns, fill_value=0
    )
    top_test_data_encoded = top_test_data_encoded.reindex(
        columns=test_features.columns, fill_value=0
    )

    # Correlation analysis with residuals (aligned on the row index).
    train_corr = top_train_data_encoded.corrwith(train_residuals.loc[top_train_indices])
    test_corr = top_test_data_encoded.corrwith(test_residuals.loc[top_test_indices])

    return {
        "train_corr": train_corr,
        "test_corr": test_corr,
        "top_train_data": top_train_data,
        "top_test_data": top_test_data,
    }
# Example usage:
model = EnergyConsumptionModel()