"""OLS model of ``environment-impact-current`` built from property features.

The dataset preparation, model fitting and error-metric logic in this module
was previously written inline in ``handler()`` in ``model_data/app.py``.
"""
import pickle

import numpy as np
import pandas as pd


def load_all_data(path="all_data.pkl"):
    """Return the object pickled at *path*.

    The file is opened read-only (``"rb"``). Opening it with ``"wb"`` would
    truncate the file and ``pickle.load`` would then fail because the handle
    is not readable.

    Args:
        path: Location of the pickle file.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load files
    # that were produced by a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)


class SalModel:
    """Prepare a modelling dataset and fit an OLS regression on it.

    Typical flow: ``create_dataset()`` -> set ``train_x`` / ``train_y`` ->
    ``fit_model()``.
    """

    # We want to estimate for making improvements on different property components
    RESPONSE = "environment-impact-current"
    # We could potentially build models by constituency to avoid having too many
    # features in the model
    BASE_FEATURES = [
        "property-type",
        "built-form",
        # "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
    ]

    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "windows-description",
        "roof-description",
        "mainheat-description",
        "main-fuel",
    ]

    # Columns cast to pandas ``category`` dtype by ``create_dataset``.
    CATEGORICAL_COLS = [
        "property-type",
        "built-form",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "lighting-description",
        "windows-description",
        "roof-description",
        "mainheat-description",
        "main-fuel",
    ]

    def __init__(self, data, cleaner):
        """Store the raw data and the cleaner that supplies u-value lookups.

        Args:
            data: Anything ``pd.DataFrame`` accepts; must contain the
                ``RESPONSE``, ``BASE_FEATURES`` and ``COMPONENT_FEATURES``
                columns by the time ``create_dataset`` is called.
            cleaner: Object exposing ``cleaned``, a mapping that is indexed
                with ``"walls-description"`` and ``"floor-description"``.
                Each entry must be convertible to a DataFrame with
                ``original_description`` and ``thermal_transmittance``
                columns.
        """
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner

        self.train_x = None
        self.train_y = None
        self.results = None
        self.model_data = None
        self.fit_error = None
        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}

    def _with_u_value(self, frame, feature, u_value_col):
        """Left-join the cleaner's thermal transmittance for *feature*.

        Rows whose description has no match get NaN in *u_value_col*.
        """
        lookup = pd.DataFrame(self.cleaner.cleaned[feature])[
            ["original_description", "thermal_transmittance"]
        ].rename(columns={"thermal_transmittance": u_value_col})
        joined = frame.merge(
            lookup,
            how="left",
            left_on=feature,
            right_on="original_description",
        )
        return joined.drop(columns=["original_description"])

    def create_dataset(self) -> pd.DataFrame:
        """Build the modelling frame and store it on ``self.model_data``.

        Steps: select the response and feature columns, append wall and floor
        u-value estimates, keep only rows that have both u-values, replace the
        wall/floor descriptions with their u-values, and cast
        ``CATEGORICAL_COLS`` to ``category`` dtype.

        Returns:
            The prepared frame (the same object as ``self.model_data``).
        """
        selected = self.df[
            [self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES
        ].reset_index(drop=True)

        # Append on u-value estimates
        selected = self._with_u_value(selected, "walls-description", "walls_u_value")
        selected = self._with_u_value(selected, "floor-description", "floor_u_value")

        # Take just entries with U-values
        has_u_values = (
            selected["walls_u_value"].notna() & selected["floor_u_value"].notna()
        )
        described_by_u_value = ("walls-description", "floor-description")
        kept_components = [
            c for c in self.COMPONENT_FEATURES if c not in described_by_u_value
        ]
        columns = (
            self.BASE_FEATURES
            + kept_components
            + ["walls_u_value", "floor_u_value", self.RESPONSE]
        )
        model_data = selected.loc[has_u_values, columns]

        # astype returns a new frame, so nothing is assigned into a slice of
        # the filtered data.
        model_data = model_data.astype(
            {col: "category" for col in self.CATEGORICAL_COLS}
        )

        self.model_data = model_data
        return model_data

    def make_training_test(self):
        """Split the dataset into training and test sets (not implemented yet)."""
        # Split into training and test
        # Dummy data
        pass

    def fit_model(self) -> None:
        """Fit an OLS regression (with intercept) and record its diagnostics.

        Populates ``results``, ``fit_error``, ``worst["errors"]`` and
        ``worst["x"]``, and adds a ``fit`` column to ``model_data``.

        Raises:
            ValueError: If the training data or ``model_data`` is missing.
        """
        # Prefer the attributes declared in __init__. ``X`` / ``Y`` are still
        # honoured for callers that assigned them directly on the instance.
        x = self.train_x if self.train_x is not None else getattr(self, "X", None)
        y = self.train_y if self.train_y is not None else getattr(self, "Y", None)
        if x is None or y is None:
            raise ValueError(
                "Training data is missing: set train_x and train_y before "
                "calling fit_model()."
            )
        if self.model_data is None:
            raise ValueError(
                "model_data is missing: call create_dataset() before fit_model()."
            )

        # Imported here so that dataset preparation does not need statsmodels.
        import statsmodels.api as sm

        # Add a constant to the independent value
        design = sm.add_constant(x)

        # make regression model and fit it
        self.results = sm.OLS(y, design).fit()

        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
            y_true=y, y_pred=self.results.fittedvalues
        )

        self.model_data["fit"] = self.results.fittedvalues
        # The worst errors over index heavily for flats
        self.worst["x"] = self.model_data[
            self.model_data.index.isin(self.worst["errors"].index)
        ]

    @staticmethod
    def plot_regression(df):
        """Plot the ``fit`` and ``actual`` columns of *df* against row position.

        Args:
            df: DataFrame with lowercase ``fit`` and ``actual`` columns.
        """
        # Imported here so that matplotlib is only needed when plotting.
        import matplotlib.pyplot as plt

        # Create an array of x-values (assumed to be sequential integers)
        x = np.arange(len(df))

        # Plot the fit and actual data
        plt.plot(x, df["fit"], color="red", label="Fit")
        plt.plot(x, df["actual"], color="blue", label="Actual")

        # Set labels and title
        plt.xlabel("Index")
        plt.ylabel("Value")
        plt.title("Linear Regression - Fit vs Actual")

        # Display legend
        plt.legend()

        # Show the plot
        plt.show()

    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """Compute regression accuracy metrics and the *n* largest errors.

        Args:
            y_true: True target values; must support ``.mean()``.
            y_pred: Predicted target values; must support ``.mean()``.
            n: Number of rows with the largest absolute residual to return.

        Returns:
            tuple: ``(metrics, worst_errors)``. ``metrics`` is a dict of the
            calculated metrics. ``worst_errors`` is a DataFrame with columns
            ``Fit`` (predicted), ``Actual`` (true), ``Residual``
            (``Actual - Fit``) and ``Absolute Residual``.
        """
        # Imported here so that scikit-learn is only needed when scoring.
        from sklearn.metrics import (
            explained_variance_score,
            mean_absolute_error,
            mean_absolute_percentage_error,
            mean_squared_error,
            median_absolute_error,
            r2_score,
        )

        metrics = {
            "MAPE": mean_absolute_percentage_error(y_true, y_pred),
            "Mean Squared Error": mean_squared_error(y_true, y_pred),
            "Mean Absolute Error": mean_absolute_error(y_true, y_pred),
            "R2 Score": r2_score(y_true, y_pred),
            "Explained Variance Score": explained_variance_score(y_true, y_pred),
            "Median Absolute Error": median_absolute_error(y_true, y_pred),
            "Mean True Value": y_true.mean(),
            "Mean Predicted Value": y_pred.mean(),
        }

        # y_true is assigned first so the frame takes its index.
        # "Actual" holds the true values and "Fit" the predictions.
        errors = pd.DataFrame()
        errors["Actual"] = y_true
        errors["Fit"] = y_pred
        errors["Residual"] = errors["Actual"] - errors["Fit"]
        errors["Absolute Residual"] = np.abs(errors["Residual"])
        errors = errors[["Fit", "Actual", "Residual", "Absolute Residual"]]

        worst_errors = errors.nlargest(n, "Absolute Residual")

        return metrics, worst_errors


if __name__ == "__main__":
    # Only load the pickle when run as a script, so importing this module
    # has no file-system side effects.
    all_data = load_all_data()
"floor-description", - "lighting-description", - "windows-description", - "roof-description", - "mainheat-description", - "main-fuel" - ] - - model_data = df[[response] + component_features + base_features] - model_data = model_data.reset_index(drop=True) - model_data["idx"] = model_data.index.copy() - - # Append on u-value estimates - model_data = model_data.merge( - pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename( - columns={"thermal_transmittance": "walls_u_value", } - ), - how="left", - left_on="walls-description", - right_on="original_description" - ) \ - .drop(columns=["original_description"]) \ - .merge( - pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename( - columns={"thermal_transmittance": "floor_u_value", } - ), - how="left", - left_on="floor-description", - right_on="original_description" - ) - # Take just entries with U-values - model_data = model_data[ - ~pd.isnull(model_data["walls_u_value"]) & - ~pd.isnull(model_data["floor_u_value"]) - ] - model_data = model_data[ - base_features + [c for c in component_features if c not in [ - "walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response] - ] - # We need to split the data into a train and test set for model build - categorical_cols = [ - "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms", - "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel", - - ] # If these categorical variables are not of type 'category', convert them - for col in categorical_cols: - model_data[col] = model_data[col].astype('category') # Dummy out the categorical variables training_data = pd.get_dummies(model_data, columns=categorical_cols, drop_first=True) @@ -316,88 +254,16 @@ def handler(): if training_data[col].dtype == object: training_data[col] = training_data[col].astype(float) - 
import statsmodels.api as sm - # Assuming 'df' is your DataFrame X = training_data.drop(columns=response) Y = training_data[response] - # Add a constant to the independent value - X1 = sm.add_constant(X) - - # make regression model - model = sm.OLS(Y, X1) - - # fit model and print results - results = model.fit() print(results.summary()) import matplotlib.pyplot as plt import numpy as np - def plot_regression(df): - # Extract the "fit" and "actual" columns from the dataframe - fit = df['fit'] - actual = df['actual'] - - # Create an array of x-values (assumed to be sequential integers) - x = np.arange(len(df)) - - # Plot the fit and actual data - plt.plot(x, fit, color='red', label='Fit') - plt.plot(x, actual, color='blue', label='Actual') - - # Set labels and title - plt.xlabel('Index') - plt.ylabel('Value') - plt.title('Linear Regression - Fit vs Actual') - - # Display legend - plt.legend() - - # Show the plot - plt.show() import numpy as np - from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ - median_absolute_error, mean_absolute_percentage_error - - def calculate_regression_metrics(y_true, y_pred, n=20): - """ - Calculate the 5 most important accuracy metrics for regression. - - Args: - y_true (array-like): Array of true target values. - y_pred (array-like): Array of predicted target values. - - Returns: - dict: Dictionary containing the calculated metrics. 
- """ - metrics = {} - - metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) - metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred) - metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred) - metrics['R2 Score'] = r2_score(y_true, y_pred) - metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred) - metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred) - metrics['Mean True Value'] = y_true.mean() - metrics['Mean Predicted Value'] = y_pred.mean() - - errors = pd.DataFrame() - errors['Fit'] = y_true - errors['Actual'] = y_pred - errors['Residual'] = errors['Actual'] - errors['Fit'] - errors['Absolute Residual'] = np.abs(errors['Residual']) - - worst_errors = errors.nlargest(n, 'Absolute Residual') - - return metrics, worst_errors - - fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues) - - model_data['fit'] = results.fittedvalues - # The worst errors over index heavily for flats - worst_x = model_data[model_data.index.isin(worst_errors.index)] # Notes # TODO: We might want to look at adding in the u-value estimates for the properties that do not have them