import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
    median_absolute_error, mean_absolute_percentage_error

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load "all_data.pkl" when it comes from a trusted source.
# The file has to be opened for reading ("rb"): opening it with "wb" truncates
# it, so there would be nothing left for pickle.load to read.
with open("all_data.pkl", "rb") as f:
    all_data = pickle.load(f)


class SalModel:
    """Ordinary-least-squares model of the ``RESPONSE`` column.

    The class selects the response and feature columns from the supplied
    records, replaces the wall and floor descriptions with the U-values held
    by ``cleaner``, and fits a ``statsmodels`` OLS regression on ``X`` / ``Y``.
    """

    # We want to estimate the response for making improvements on different
    # property components.
    RESPONSE = "environment-impact-current"

    # We could potentially build models by constituency to avoid having too
    # many features in the model.
    BASE_FEATURES = [
        "property-type",
        "built-form",
        # "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
    ]

    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "windows-description",
        "roof-description",
        "mainheat-description",
        "main-fuel",
    ]

    CATEGORICAL_COLS = [
        "property-type",
        "built-form",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "lighting-description",
        "windows-description",
        "roof-description",
        "mainheat-description",
        "main-fuel",
    ]

    # Description columns that are replaced by a numeric U-value column.
    U_VALUE_COLUMNS = {
        "walls-description": "walls_u_value",
        "floor-description": "floor_u_value",
    }

    def __init__(self, data, cleaner):
        """Store the raw records and the description cleaner.

        Args:
            data: Anything ``pd.DataFrame`` accepts; it must provide the
                ``RESPONSE``, ``BASE_FEATURES`` and ``COMPONENT_FEATURES``
                columns for ``create_dataset`` to work.
            cleaner: Object exposing ``cleaned``, a mapping from a description
                column name to tabular data with ``original_description`` and
                ``thermal_transmittance`` fields.
        """
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner

        self.train_x = None
        self.train_y = None
        # Design matrix and response read by fit_model(). Nothing in this
        # class populates them yet, so they must be assigned before fitting.
        self.X = None
        self.Y = None
        self.results = None
        self.model_data = None
        self.fit_error = None
        self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}

    def _u_value_lookup(self, description_col, u_value_col):
        """Return the cleaner's description -> U-value table for one component."""
        lookup = pd.DataFrame(self.cleaner.cleaned[description_col])
        return lookup[["original_description", "thermal_transmittance"]].rename(
            columns={"thermal_transmittance": u_value_col}
        )

    def create_dataset(self):
        """Build the modelling table and store it in ``self.model_data``.

        Wall and floor descriptions are swapped for their U-values, rows
        lacking either U-value are dropped, and every column listed in
        ``CATEGORICAL_COLS`` is converted to the ``category`` dtype.

        Returns:
            pd.DataFrame: The modelling table (also kept on
            ``self.model_data``).
        """
        selected = [self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES
        model_data = self.df[selected].reset_index(drop=True)

        # Append the U-value estimates, one left merge per component.
        for description_col, u_value_col in self.U_VALUE_COLUMNS.items():
            model_data = model_data.merge(
                self._u_value_lookup(description_col, u_value_col),
                how="left",
                left_on=description_col,
                right_on="original_description",
            ).drop(columns=["original_description"])

        # Take just the entries that have both U-values.
        model_data = model_data.dropna(subset=list(self.U_VALUE_COLUMNS.values()))

        remaining_components = [
            col for col in self.COMPONENT_FEATURES if col not in self.U_VALUE_COLUMNS
        ]
        final_columns = (
            self.BASE_FEATURES
            + remaining_components
            + list(self.U_VALUE_COLUMNS.values())
            + [self.RESPONSE]
        )

        # astype returns a new frame, which avoids chained assignment on the
        # filtered slice.
        model_data = model_data[final_columns].astype(
            {col: "category" for col in self.CATEGORICAL_COLS}
        )

        # Previously the table was built and then discarded, leaving
        # self.model_data as None for fit_model().
        self.model_data = model_data
        return model_data

    def make_training_test(self):
        """Placeholder for the train/test split and dummy encoding.

        Not implemented yet: it currently does nothing, so ``X`` and ``Y``
        have to be assigned by the caller before ``fit_model`` is used.
        """
        # Split into training and test
        # Dummy data
        pass

    def fit_model(self):
        """Fit an OLS regression of ``Y`` on ``X`` (plus an intercept).

        Stores the fitted results in ``self.results``, the accuracy metrics in
        ``self.fit_error``, the largest residuals in ``self.worst["errors"]``
        and the matching rows of ``self.model_data`` in ``self.worst["x"]``.
        A ``fit`` column with the fitted values is added to
        ``self.model_data``.

        Raises:
            RuntimeError: If ``X``/``Y`` or ``model_data`` have not been set.
        """
        if self.X is None or self.Y is None:
            raise RuntimeError("X and Y must be set before calling fit_model()")
        if self.model_data is None:
            raise RuntimeError("create_dataset() must be called before fit_model()")

        # Add a constant (intercept) column to the independent variables.
        design = sm.add_constant(self.X)

        # Build and fit the regression model.
        self.results = sm.OLS(self.Y, design).fit()

        self.fit_error, self.worst["errors"] = self.calculate_regression_metrics(
            y_true=self.Y,
            y_pred=self.results.fittedvalues,
        )

        self.model_data['fit'] = self.results.fittedvalues

        # Author's observation: the worst errors over-index heavily for flats.
        worst_index = self.worst["errors"].index
        self.worst["x"] = self.model_data[self.model_data.index.isin(worst_index)]

    @staticmethod
    def plot_regression(df):
        """Plot fitted against actual values by row position.

        Args:
            df: DataFrame with a ``fit`` column and an ``actual`` column.
        """
        fit = df['fit']
        actual = df['actual']

        # The x-axis is simply the row position (sequential integers).
        x = np.arange(len(df))

        plt.plot(x, fit, color='red', label='Fit')
        plt.plot(x, actual, color='blue', label='Actual')

        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title('Linear Regression - Fit vs Actual')
        plt.legend()
        plt.show()

    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """Calculate accuracy metrics and the largest residuals of a regression.

        Args:
            y_true (array-like): True target values.
            y_pred (array-like): Predicted target values.
            n (int): Number of rows with the largest absolute residual to
                return.

        Returns:
            tuple: ``(metrics, worst_errors)``. ``metrics`` is a dict of the
            calculated metrics. ``worst_errors`` is a DataFrame with the
            columns ``Fit`` (prediction), ``Actual`` (true value),
            ``Residual`` (``Actual - Fit``) and ``Absolute Residual``,
            holding the ``n`` rows with the largest absolute residual.
        """
        metrics = {
            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
            'Mean Squared Error': mean_squared_error(y_true, y_pred),
            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
            'R2 Score': r2_score(y_true, y_pred),
            'Explained Variance Score': explained_variance_score(y_true, y_pred),
            'Median Absolute Error': median_absolute_error(y_true, y_pred),
            # np.mean also accepts plain sequences, unlike the .mean() method.
            'Mean True Value': np.mean(y_true),
            'Mean Predicted Value': np.mean(y_pred),
        }

        # The frame takes its index from y_true; "Fit" is the prediction and
        # "Actual" the true value (these labels used to be swapped).
        errors = pd.DataFrame({'Actual': y_true})
        errors['Fit'] = y_pred
        errors = errors[['Fit', 'Actual']]
        errors['Residual'] = errors['Actual'] - errors['Fit']
        errors['Absolute Residual'] = np.abs(errors['Residual'])

        worst_errors = errors.nlargest(n, 'Absolute Residual')

        return metrics, worst_errors