mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
poc sap model wip, probably need full panel of data
This commit is contained in:
parent
cbfb9a5a93
commit
8c55df82fa
2 changed files with 99 additions and 3 deletions
|
|
@ -232,6 +232,8 @@ def handler():
|
|||
|
||||
# We want to estimate for making improvements on different property components
|
||||
response = "environment-impact-current"
|
||||
# We could potentially build models by constituency to avoid having too many
|
||||
# features in the model
|
||||
base_features = [
|
||||
"property-type",
|
||||
"built-form",
|
||||
|
|
@ -258,14 +260,33 @@ def handler():
|
|||
)
|
||||
# Take just entries with U-values
|
||||
model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
|
||||
model_data = model_data[base_features + ["thermal_transmittance", response]]
|
||||
|
||||
# We need to split the data into a train and test set for model build
|
||||
categorical_cols = [
|
||||
"property-type", "built-form", "number-habitable-rooms", "constituency",
|
||||
]
|
||||
|
||||
# If these categorical variables are not of type 'category', convert them
|
||||
for col in categorical_cols:
|
||||
model_data[col] = model_data[col].astype('category')
|
||||
|
||||
# Dummy out the categorical variables
|
||||
training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
|
||||
|
||||
# Convert booleans to integer
|
||||
for col in training_data.columns:
|
||||
if training_data[col].dtype == bool:
|
||||
training_data[col] = training_data[col].astype(int)
|
||||
|
||||
if training_data[col].dtype == object:
|
||||
training_data[col] = training_data[col].astype(float)
|
||||
|
||||
import statsmodels.api as sm
|
||||
|
||||
# Design matrix (X) and response (Y) for the regression
|
||||
X = model_data[base_features + ["thermal_transmittance"]]
|
||||
Y = model_data[response]
|
||||
X = training_data.drop(columns=response)
|
||||
Y = training_data[response]
|
||||
|
||||
# Add a constant (intercept) term to the independent variables
|
||||
X1 = sm.add_constant(X)
|
||||
|
|
@ -277,6 +298,80 @@ def handler():
|
|||
results = model.fit()
|
||||
print(results.summary())
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
def plot_regression(df):
    """Plot the fitted and actual values of a regression as two lines.

    Args:
        df: DataFrame with a 'fit' column and an 'actual' column. Rows are
            drawn in their current order against a sequential 0..len(df)-1
            x-axis, so sort the frame beforehand if an ordering is wanted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Pull both columns up front so a missing column fails before any drawing
    series_specs = (
        (df['fit'], 'red', 'Fit'),
        (df['actual'], 'blue', 'Actual'),
    )

    # Row position, not the frame's own index, is used for the x-axis
    positions = np.arange(len(df))

    for values, line_colour, legend_label in series_specs:
        plt.plot(positions, values, color=line_colour, label=legend_label)

    # Axis labels, title and legend
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Linear Regression - Fit vs Actual')
    plt.legend()

    plt.show()
|
||||
|
||||
import numpy as np
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error
|
||||
|
||||
def calculate_regression_metrics(y_true, y_pred, n=20):
    """Calculate summary accuracy metrics for a regression and its worst fits.

    Args:
        y_true (array-like): Array of true target values.
        y_pred (array-like): Array of predicted target values.
        n (int): Number of rows with the largest absolute residual to return.

    Returns:
        tuple: ``(metrics, worst_errors)`` where ``metrics`` is a dict mapping
        metric name to value, and ``worst_errors`` is a DataFrame with the
        columns 'Fit', 'Actual', 'Residual' and 'Absolute Residual' holding
        the ``n`` rows with the largest absolute residual.
    """
    metrics = {
        'Mean Squared Error': mean_squared_error(y_true, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        'Explained Variance Score': explained_variance_score(y_true, y_pred),
        'Median Absolute Error': median_absolute_error(y_true, y_pred),
    }

    errors = pd.DataFrame()
    # 'Actual' holds the observed values and 'Fit' the predictions; these were
    # previously swapped, which mislabelled the columns and flipped the sign
    # of 'Residual'. y_true is assigned first so the frame keeps its index.
    errors['Actual'] = y_true
    errors['Fit'] = y_pred
    errors['Residual'] = errors['Actual'] - errors['Fit']
    errors['Absolute Residual'] = np.abs(errors['Residual'])

    # Keep the column order callers have seen so far
    errors = errors[['Fit', 'Actual', 'Residual', 'Absolute Residual']]

    worst_errors = errors.nlargest(n, 'Absolute Residual')

    return metrics, worst_errors
|
||||
|
||||
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
|
||||
|
||||
worst_x = model_data[model_data.index.isin(worst_errors.index)]
|
||||
# The worst errors over-index heavily on flats
|
||||
|
||||
fit_df = pd.DataFrame(
|
||||
{
|
||||
"fit": results.fittedvalues,
|
||||
"actual": Y
|
||||
}
|
||||
)
|
||||
|
||||
# Sort on magnitude of actual
|
||||
fit_df = fit_df.sort_values("actual", ascending=True)
|
||||
plot_regression(fit_df)
|
||||
|
||||
model_data[["thermal_transmittance", response]].corr()
|
||||
|
||||
summary = model_data.groupby(["property-type", "built-form"], observed=True)[
|
||||
|
|
|
|||
|
|
@ -15,4 +15,5 @@ pint
|
|||
geopandas
|
||||
mip
|
||||
seaborn
|
||||
statsmodels
|
||||
statsmodels
|
||||
scikit-learn
|
||||
Loading…
Add table
Reference in a new issue