def plot_regression(df):
    """
    Plot fitted values and actual values as two lines against row position.

    Args:
        df (DataFrame): Must provide a 'fit' column and an 'actual' column.
            Rows are drawn in the order in which they appear in ``df``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    fitted, observed = df['fit'], df['actual']

    # The x-axis is the row position (0, 1, 2, ...), not the dataframe's
    # own index, so any prior sorting of ``df`` determines the curve shape.
    positions = np.arange(len(df))

    for series, colour, legend_name in (
        (fitted, 'red', 'Fit'),
        (observed, 'blue', 'Actual'),
    ):
        plt.plot(positions, series, color=colour, label=legend_name)

    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Linear Regression - Fit vs Actual')
    plt.legend()
    plt.show()
def calculate_regression_metrics(y_true, y_pred, n=20):
    """
    Score a regression fit and pick out its worst-predicted observations.

    Five metrics are computed with ``sklearn.metrics``: mean squared error,
    mean absolute error, R2, explained variance and median absolute error.

    Args:
        y_true (array-like): Array of true target values.
        y_pred (array-like): Array of predicted target values.
        n (int): Number of rows with the largest absolute residual to
            return. Defaults to 20.

    Returns:
        tuple: ``(metrics, worst_errors)``.
            ``metrics`` is a dict mapping metric name to its value.
            ``worst_errors`` is a DataFrame with columns 'Fit', 'Actual',
            'Residual' and 'Absolute Residual', holding the ``n`` rows with
            the largest 'Absolute Residual'.
    """
    metrics = {
        'Mean Squared Error': mean_squared_error(y_true, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        'Explained Variance Score': explained_variance_score(y_true, y_pred),
        'Median Absolute Error': median_absolute_error(y_true, y_pred),
    }

    # 'Actual' holds the observed target and 'Fit' the model prediction, so
    # that 'Residual' is observed minus predicted. (Previously the two were
    # stored under each other's label, which flipped the residual's sign.)
    # y_true is assigned first so the frame takes its index, as before.
    errors = pd.DataFrame()
    errors['Actual'] = y_true
    errors['Fit'] = y_pred
    errors['Residual'] = errors['Actual'] - errors['Fit']
    errors['Absolute Residual'] = np.abs(errors['Residual'])

    # Keep the column layout callers have seen so far.
    errors = errors[['Fit', 'Actual', 'Residual', 'Absolute Residual']]

    worst_errors = errors.nlargest(n, 'Absolute Residual')

    return metrics, worst_errors
model_data.groupby(["property-type", "built-form"], observed=True)[ diff --git a/model_data/requirements.txt b/model_data/requirements.txt index 609cb528..d1dfdd73 100644 --- a/model_data/requirements.txt +++ b/model_data/requirements.txt @@ -15,4 +15,5 @@ pint geopandas mip seaborn -statsmodels \ No newline at end of file +statsmodels +scikit-learn \ No newline at end of file