""" Generate metrics and enable regeneration of metrics if new metrics are generated Key tasks: - Specify metric functions that take in prediction vs actual to generate a metric value - Given a model and test data, produce a suite of all metrics """ import pandas as pd from pathlib import Path import seaborn as sns import matplotlib.pyplot as plt from core.Settings import ( RESIDUAL_TRUE_LABEL, RESIDUAL_PREDICTION_LABEL, SEABORN_RESIDUAL_AXIS_FONTSIZE, SEABORN_RESIDUAL_TITLE_FONTSIZE, SEABORN_RESIDUAL_STYLE, SEABORN_RESIDUAL_ASPECT_RATIO, SEABORN_RESIDUAL_PLOT_DPI, SEABORN_RESIDUAL_RANGE, SEABORN_RESIDUAL_LINE_COLOUR, SEABORN_RESIDUAL_LINE_WIDTH, ) from sklearn.metrics import ( mean_absolute_error, median_absolute_error, mean_squared_error, mean_absolute_percentage_error, ) # Dummy example of new metric that can be added - must be true and prediction as arguments def max_error(y_true: pd.Series, y_pred: pd.Series): return max(y_true - y_pred) METRIC_TO_APPLY = [ mean_absolute_error, median_absolute_error, mean_squared_error, mean_absolute_percentage_error, # max_error ] def sort_by_metric( data: pd.DataFrame, optimse_metric: str, best_model_column_name: str ) -> pd.DataFrame: """ Helper function to sort data frame by metric and append a best model flag """ # Ascending as we want lowest error values data = data.sort_values(optimse_metric, ascending=True).reset_index(drop=True) data[best_model_column_name] = [False] * len(data) data.loc[0, best_model_column_name] = True return data class Metrics: """ All metric functions used to generate a dictionary of metrics """ @staticmethod def list_metric_functions() -> list: """ Gather all metric functions to run """ return [metric_to_apply.__name__ for metric_to_apply in METRIC_TO_APPLY] @staticmethod def generate_metric_suite(actuals: pd.Series, predictions: pd.Series) -> pd.Series: """ For the model, test data and target, generate predictions and then iterative over all metrics to generate a Series of metric values """ metric_dict = {} for metric_function in METRIC_TO_APPLY: metric_dict[metric_function.__name__] = metric_function( actuals, predictions ) metrics = pd.Series(metric_dict) return metrics @staticmethod def generate_plot_suite(): """ Can do all metric ploting """ @staticmethod def generate_residual_plot( actuals: pd.Series, predictions: pd.Series, target_column: str, output_filepath: Path | str, ): # TODO: can have a model.metric_outputs method # FOr not just do it here residual_df = pd.DataFrame( list(zip(actuals, predictions)), columns=[RESIDUAL_TRUE_LABEL, RESIDUAL_PREDICTION_LABEL], ) # image formatting sns.set(style=SEABORN_RESIDUAL_STYLE) ax = sns.scatterplot( x=RESIDUAL_TRUE_LABEL, y=RESIDUAL_PREDICTION_LABEL, data=residual_df ) ax.set_aspect(SEABORN_RESIDUAL_ASPECT_RATIO) ax.set_xlabel(f"True {target_column}", fontsize=SEABORN_RESIDUAL_AXIS_FONTSIZE) ax.set_ylabel( f"Predicted {target_column}", fontsize=SEABORN_RESIDUAL_AXIS_FONTSIZE ) # ylabel ax.set_title("Residuals", fontsize=SEABORN_RESIDUAL_TITLE_FONTSIZE) # Square aspect ratio ax.plot( SEABORN_RESIDUAL_RANGE, SEABORN_RESIDUAL_RANGE, SEABORN_RESIDUAL_LINE_COLOUR, linewidth=SEABORN_RESIDUAL_LINE_WIDTH, ) plt.tight_layout() plt.savefig(output_filepath, dpi=SEABORN_RESIDUAL_PLOT_DPI)