creating rough framework for sap model

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-03 13:01:41 +01:00
parent c08f74cefb
commit f941a3c512
2 changed files with 170 additions and 134 deletions

View file

@ -0,0 +1,170 @@
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error, mean_absolute_percentage_error
# Load the cached dataset that the model is built from.
# The file must be opened for *reading* in binary mode: the previous "wb" mode
# truncated the pickle on open and made pickle.load() fail on a write-only handle.
# NOTE(review): pickle.load executes arbitrary code embedded in the file - only
# load pickles produced by this project.
with open("all_data.pkl", "rb") as f:
    all_data = pickle.load(f)
class SalModel:
    """Rough framework for an OLS model of the ``environment-impact-current`` score.

    The model data combines base property features, component descriptions and
    wall/floor U-value estimates looked up from ``cleaner.cleaned``.

    Typical flow: ``create_dataset()`` -> ``make_training_test()`` -> ``fit_model()``.
    """

    # We want to estimate for making improvements on different property components
    RESPONSE = "environment-impact-current"

    # We could potentially build models by constituency to avoid having too many
    # features in the model
    BASE_FEATURES = [
        "property-type",
        "built-form",
        # "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
    ]

    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "windows-description",
        "roof-description",
        "mainheat-description",
        "main-fuel"
    ]

    # Columns converted to pandas 'category' dtype by create_dataset()
    CATEGORICAL_COLS = [
        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
    ]

    def __init__(self, data, cleaner):
        """
        Args:
            data: Anything accepted by ``pd.DataFrame``; must contain the RESPONSE,
                BASE_FEATURES and COMPONENT_FEATURES columns.
            cleaner: Object exposing ``cleaned``, a mapping whose
                "walls-description" and "floor-description" entries can be turned
                into DataFrames with "original_description" and
                "thermal_transmittance" columns.
        """
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner

        self.train_x = None
        self.train_y = None
        self.results = None
        self.model_data = None
        self.fit_error = None
        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}

    def _u_value_lookup(self, component, u_value_col):
        """Return the description -> U-value lookup table for one component."""
        lookup = pd.DataFrame(self.cleaner.cleaned[component])[
            ["original_description", "thermal_transmittance"]
        ]
        return lookup.rename(columns={"thermal_transmittance": u_value_col})

    def create_dataset(self):
        """Build the modelling frame and store it on ``self.model_data``.

        Wall and floor descriptions are replaced by their U-value estimates, rows
        without both U-values are dropped, and the categorical columns are
        converted to 'category' dtype.

        Returns:
            pd.DataFrame: The same frame that is stored on ``self.model_data``.
        """
        model_data = self.df[
            [self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES
        ].reset_index(drop=True)

        # Append on u-value estimates
        u_value_sources = (
            ("walls-description", "walls_u_value"),
            ("floor-description", "floor_u_value"),
        )
        for component, u_value_col in u_value_sources:
            model_data = model_data.merge(
                self._u_value_lookup(component, u_value_col),
                how="left",
                left_on=component,
                right_on="original_description",
            ).drop(columns=["original_description"])

        # Take just entries with U-values
        model_data = model_data.dropna(subset=["walls_u_value", "floor_u_value"])

        described_by_u_value = ("walls-description", "floor-description")
        keep_cols = (
            self.BASE_FEATURES
            + [c for c in self.COMPONENT_FEATURES if c not in described_by_u_value]
            + ["walls_u_value", "floor_u_value", self.RESPONSE]
        )
        # Explicit copy so the dtype conversion below never writes into a view
        model_data = model_data[keep_cols].copy()

        for col in self.CATEGORICAL_COLS:
            model_data[col] = model_data[col].astype('category')

        # Previously the frame was built and then discarded; keep it for fit_model()
        self.model_data = model_data
        return model_data

    def make_training_test(self):
        """Placeholder: intended to dummy-encode the data and split it into train/test.

        Not implemented yet. ``fit_model`` needs ``self.train_x`` and
        ``self.train_y`` to be populated before it is called.
        """
        # Split into training and test
        # Dummy data
        pass

    def fit_model(self):
        """Fit an OLS regression of ``train_y`` on ``train_x`` plus a constant.

        Stores the fitted results, the fit metrics and the worst-fitting rows on
        the instance, and adds a 'fit' column to ``self.model_data``.

        Raises:
            RuntimeError: If the training data or the model data has not been set.
        """
        if self.train_x is None or self.train_y is None:
            raise RuntimeError("train_x and train_y must be set before calling fit_model()")
        if self.model_data is None:
            raise RuntimeError("model_data is not set; call create_dataset() before fit_model()")

        # Add a constant to the independent value
        x1 = sm.add_constant(self.train_x)

        # make regression model
        model = sm.OLS(self.train_y, x1)

        # fit model and print results
        self.results = model.fit()

        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
            y_true=self.train_y, y_pred=self.results.fittedvalues
        )

        # Assignment aligns on the index, so rows keep their own fitted value
        self.model_data['fit'] = self.results.fittedvalues

        # The worst errors over-index heavily for flats
        self.worst["x"] = self.model_data[self.model_data.index.isin(self.worst["errors"].index)]

    @staticmethod
    def plot_regression(df):
        """Plot the 'fit' and 'actual' columns of ``df`` against row position."""
        # Extract the "fit" and "actual" columns from the dataframe
        fit = df['fit']
        actual = df['actual']

        # Create an array of x-values (assumed to be sequential integers)
        x = np.arange(len(df))

        # Plot the fit and actual data
        plt.plot(x, fit, color='red', label='Fit')
        plt.plot(x, actual, color='blue', label='Actual')

        # Set labels and title
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title('Linear Regression - Fit vs Actual')

        # Display legend
        plt.legend()

        # Show the plot
        plt.show()

    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """
        Calculate accuracy metrics for a regression and find its worst-fitting rows.

        Args:
            y_true (array-like): Array of true target values.
            y_pred (array-like): Array of predicted target values.
            n (int): Number of rows with the largest absolute residual to return.

        Returns:
            tuple: ``(metrics, worst_errors)`` where ``metrics`` is a dict of the
            calculated metrics and ``worst_errors`` is a DataFrame with 'Fit',
            'Actual', 'Residual' and 'Absolute Residual' columns.
        """
        metrics = {}

        metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
        metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
        metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
        metrics['R2 Score'] = r2_score(y_true, y_pred)
        metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
        metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
        metrics['Mean True Value'] = y_true.mean()
        metrics['Mean Predicted Value'] = y_pred.mean()

        errors = pd.DataFrame()
        # 'Fit' is the prediction and 'Actual' the observed value; these labels
        # used to be assigned the wrong way round.
        errors['Fit'] = y_pred
        errors['Actual'] = y_true
        # Residual = observed - predicted
        errors['Residual'] = errors['Actual'] - errors['Fit']
        errors['Absolute Residual'] = np.abs(errors['Residual'])

        worst_errors = errors.nlargest(n, 'Absolute Residual')

        return metrics, worst_errors

View file

@ -239,71 +239,9 @@ def handler():
pd.set_option('display.width', 1000)
df = pd.DataFrame(data)
# We want to estimate for making improvements on different property components
response = "environment-impact-current"
# We could potentially build models by constituency to avoid having too many
# features in the model
base_features = [
"property-type",
"built-form",
# "construction-age-band",
"number-habitable-rooms",
"constituency",
"number-heated-rooms",
]
component_features = [
"walls-description",
"floor-description",
"lighting-description",
"windows-description",
"roof-description",
"mainheat-description",
"main-fuel"
]
model_data = df[[response] + component_features + base_features]
model_data = model_data.reset_index(drop=True)
model_data["idx"] = model_data.index.copy()
# Append on u-value estimates
model_data = model_data.merge(
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "walls_u_value", }
),
how="left",
left_on="walls-description",
right_on="original_description"
) \
.drop(columns=["original_description"]) \
.merge(
pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "floor_u_value", }
),
how="left",
left_on="floor-description",
right_on="original_description"
)
# Take just entries with U-values
model_data = model_data[
~pd.isnull(model_data["walls_u_value"]) &
~pd.isnull(model_data["floor_u_value"])
]
model_data = model_data[
base_features + [c for c in component_features if c not in [
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
]
# We need to split the data into a train and test set for model build
categorical_cols = [
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
]
# If these categorical variables are not of type 'category', convert them
for col in categorical_cols:
model_data[col] = model_data[col].astype('category')
# Dummy out the categorical variables
training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True)
@ -316,88 +254,16 @@ def handler():
if training_data[col].dtype == object:
training_data[col] = training_data[col].astype(float)
import statsmodels.api as sm
# Assuming 'df' is your DataFrame
X = training_data.drop(columns=response)
Y = training_data[response]
# Add a constant to the independent value
X1 = sm.add_constant(X)
# make regression model
model = sm.OLS(Y, X1)
# fit model and print results
results = model.fit()
print(results.summary())
import matplotlib.pyplot as plt
import numpy as np
def plot_regression(df):
    """Draw the 'fit' and 'actual' columns of ``df`` as two lines and show the figure.

    Both lines are plotted against the row position (0, 1, 2, ...) rather than
    the frame's index.
    """
    # Pull both series up front so a missing column fails before anything is drawn
    fitted_values = df['fit']
    actual_values = df['actual']
    positions = np.arange(len(df))

    # Fitted line in red, observed line in blue
    series_to_draw = (
        (fitted_values, 'red', 'Fit'),
        (actual_values, 'blue', 'Actual'),
    )
    for values, colour, legend_label in series_to_draw:
        plt.plot(positions, values, color=colour, label=legend_label)

    # Axis labels, title and legend
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Linear Regression - Fit vs Actual')
    plt.legend()

    # Render the figure
    plt.show()
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error, mean_absolute_percentage_error
def calculate_regression_metrics(y_true, y_pred, n=20):
    """
    Calculate accuracy metrics for a regression and find its worst-fitting rows.

    Args:
        y_true (array-like): Array of true target values.
        y_pred (array-like): Array of predicted target values.
        n (int): Number of rows with the largest absolute residual to return.

    Returns:
        tuple: ``(metrics, worst_errors)`` where ``metrics`` is a dict of the
        calculated metrics and ``worst_errors`` is a DataFrame with 'Fit',
        'Actual', 'Residual' and 'Absolute Residual' columns.
    """
    metrics = {}

    metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)
    metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
    metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
    metrics['Mean True Value'] = y_true.mean()
    metrics['Mean Predicted Value'] = y_pred.mean()

    errors = pd.DataFrame()
    # 'Fit' is the prediction and 'Actual' the observed value; these labels
    # used to be assigned the wrong way round.
    errors['Fit'] = y_pred
    errors['Actual'] = y_true
    # Residual = observed - predicted
    errors['Residual'] = errors['Actual'] - errors['Fit']
    errors['Absolute Residual'] = np.abs(errors['Residual'])

    worst_errors = errors.nlargest(n, 'Absolute Residual')

    return metrics, worst_errors
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
model_data['fit'] = results.fittedvalues
# The worst errors over-index heavily for flats
worst_x = model_data[model_data.index.isin(worst_errors.index)]
# Notes
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them