poc sap model wip, probably need full panel of data

This commit is contained in:
Khalim Conn-Kowlessar 2023-06-30 18:34:41 +01:00
parent cbfb9a5a93
commit 8c55df82fa
2 changed files with 99 additions and 3 deletions

View file

@ -232,6 +232,8 @@ def handler():
# We want to estimate for making improvements on different property components
response = "environment-impact-current"
# We could potentially build models by constituency to avoid having too many
# features in the model
base_features = [
"property-type",
"built-form",
@ -258,14 +260,33 @@ def handler():
)
# Take just entries with U-values
model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
model_data = model_data[base_features + ["thermal_transmittance", response]]
# We need to split the data into a train and test set for model build
categorical_cols = [
"property-type", "built-form", "number-habitable-rooms", "constituency",
]
# If these categorical variables are not of type 'category', convert them
for col in categorical_cols:
model_data[col] = model_data[col].astype('category')
# Dummy out the categorical variables
training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
# Convert booleans to integer
for col in training_data.columns:
if training_data[col].dtype == bool:
training_data[col] = training_data[col].astype(int)
if training_data[col].dtype == object:
training_data[col] = training_data[col].astype(float)
import statsmodels.api as sm
# Separate the predictors (X) from the response (Y)
X = model_data[base_features + ["thermal_transmittance"]]
Y = model_data[response]
X = training_data.drop(columns=response)
Y = training_data[response]
# Add a constant (intercept) column to the independent variables
X1 = sm.add_constant(X)
@ -277,6 +298,80 @@ def handler():
results = model.fit()
print(results.summary())
import matplotlib.pyplot as plt
import numpy as np
def plot_regression(df):
    """
    Plot the fitted and actual values of a regression against row position.

    Args:
        df (pandas.DataFrame): Frame with a 'fit' column and an 'actual'
            column. Rows are drawn in their current order, so sort the frame
            beforehand if a particular ordering is wanted.
    """
    # Pull both columns up front so a missing column fails before drawing
    series_to_draw = (
        (df['fit'], 'red', 'Fit'),
        (df['actual'], 'blue', 'Actual'),
    )
    # Positions along the x-axis are simply 0..len(df)-1
    positions = np.arange(len(df))
    for values, colour, legend_name in series_to_draw:
        plt.plot(positions, values, color=colour, label=legend_name)
    # Axis labels, title and legend
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Linear Regression - Fit vs Actual')
    plt.legend()
    # Render the figure
    plt.show()
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error
def calculate_regression_metrics(y_true, y_pred, n=20):
    """
    Calculate five regression accuracy metrics and the worst-fitted rows.

    Args:
        y_true (array-like): Array of true target values.
        y_pred (array-like): Array of predicted target values.
        n (int): Number of rows with the largest absolute residual to return.

    Returns:
        tuple: ``(metrics, worst_errors)``.
            ``metrics`` is a dict keyed by metric name (mean squared error,
            mean absolute error, R2, explained variance, median absolute
            error).
            ``worst_errors`` is a DataFrame with the columns 'Fit',
            'Actual', 'Residual' and 'Absolute Residual' holding the ``n``
            rows with the largest absolute residual.
    """
    metrics = {
        'Mean Squared Error': mean_squared_error(y_true, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        'Explained Variance Score': explained_variance_score(y_true, y_pred),
        'Median Absolute Error': median_absolute_error(y_true, y_pred),
    }
    errors = pd.DataFrame()
    # Observations are assigned first so the frame keeps the index of y_true
    # (when it has one) and the predictions are aligned against it.
    errors['Actual'] = y_true
    # 'Fit' must hold the predictions; previously the two columns were
    # swapped, which flipped the sign of 'Residual'.
    errors['Fit'] = y_pred
    # Keep the established column order for downstream consumers
    errors = errors[['Fit', 'Actual']]
    errors['Residual'] = errors['Actual'] - errors['Fit']
    errors['Absolute Residual'] = np.abs(errors['Residual'])
    worst_errors = errors.nlargest(n, 'Absolute Residual')
    return metrics, worst_errors
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
worst_x = model_data[model_data.index.isin(worst_errors.index)]
# The worst errors over-index heavily on flats
fit_df = pd.DataFrame(
{
"fit": results.fittedvalues,
"actual": Y
}
)
# Sort on magnitude of actual
fit_df = fit_df.sort_values("actual", ascending=True)
plot_regression(fit_df)
model_data[["thermal_transmittance", response]].corr()
summary = model_data.groupby(["property-type", "built-form"], observed=True)[

View file

@ -15,4 +15,5 @@ pint
geopandas
mip
seaborn
statsmodels
statsmodels
scikit-learn